├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.en.md ├── README.md ├── check.go ├── download.go ├── go.mod ├── go.sum ├── http_download.go ├── img ├── arch-cn1.png ├── arch-cn2.png ├── arch-en1.png └── arch-en2.png ├── main.go ├── s3tos3.go ├── sqs2trans.go ├── upload.go └── util.go /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .* 3 | __* 4 | 5 | *.yaml 6 | *.txt 7 | *.db 8 | *.log 9 | s3trans 10 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 
41 | 
42 | 
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
45 | 
46 | 
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 | 
52 | 
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project, we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue.
55 | 
56 | 
57 | ## Licensing
58 | 
59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 | 
61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes.
62 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
4 | this software and associated documentation files (the "Software"), to deal in
5 | the Software without restriction, including without limitation the rights to
6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7 | the Software, and to permit persons to whom the Software is furnished to do so.
8 | 
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
15 | 
16 | 
--------------------------------------------------------------------------------
/README.en.md:
--------------------------------------------------------------------------------
1 | 
2 | # Amazon S3 Resumable Transfer V2 (Amazon S3 断点续传传输 V2)
3 | 
4 | 中文 README: [README.md](README.md)
5 | 
6 | Multi-threaded resumable transfer, suitable for batch upload/download of large files between local storage and S3, and for migration across object storage services. It supports Amazon S3, Alibaba Cloud OSS, Tencent Cloud COS, Google Cloud Storage, Huawei Cloud OBS, and other object storage services compatible with the S3 API. In Version 2, the same application can be configured for a variety of scenarios: single-machine uploading, single-machine downloading, deployment as the cluster component that scans source files, or as a distributed transfer worker node in a cluster. It has been refactored in Go for improved performance and supports a range of extended features: exclusion list, source no-sign-request, source request-payer, destination storage-class, destination ACL, and metadata transfer.
7 | 8 | ![img](./img/arch-en1.png) 9 | ![img](./img/arch-en2.png) 10 | 11 | ## Features 12 | 13 | * Supports multi-threaded concurrent transfers to multiple object storage systems, with resumable transfers, automatic retries, and concurrent multi-file tasks to fully utilize bandwidth. An optimized flow control mechanism is implemented. In a cluster test (10*m5.large instances), 1.2TB of data was migrated from us-east-1 to cn-northwest-1 in just 1 hour. In another single-machine bandwidth test, using an m6i.8xlarge EC2 instance (num-workers 16), a sustained transfer speed of 12Gbps was achieved between two S3 buckets in the same region. 14 | 15 | * Supports sources and destinations including local directories or files, Amazon S3, Alibaba OSS, Tencent COS, Google GCS, and other object storage systems. No need to distinguish between work modes; simply specify the source and destination URLs or local paths, and the transfer will automatically start. Can handle single files or objects, entire directories, S3 buckets/prefixes, etc. 16 | 17 | * Data is transferred through intermediate nodes in single chunk form without being written to disk, saving time and improving security. Supports transfers from 0 bytes up to TB-level sizes. 18 | 19 | * Allows setting various object storage classes for the destination, such as STANDARD|REDUCED_REDUNDANCY|STANDARD_IA|ONEZONE_IA|INTELLIGENT_TIERING|GLACIER|DEEP_ARCHIVE|OUTPOSTS|GLACIER_IR|SNOW 20 | 21 | * Supports specifying the destination S3 ACL: private|public-read|public-read-write|authenticated-read|aws-exec-read|bucket-owner-read|bucket-owner-full-control 22 | 23 | * Supports setting the source object storage as no-sign-request and request-payer. 24 | 25 | * Supports source objects as Presigned URLs or URL lists (please ensure the correct Region is specified when generating Presigned URLs). 26 | 27 | * Supports retrieving and copying metadata from the source object storage to the destination. 28 | 29 | * Automatically compares file names and sizes between source and destination buckets, transferring only mismatched files. By default, it lists and transfers simultaneously, fetching destination object information and transferring one by one, providing an immediate transfer start after entering the command (similar to AWS CLI). Optionally, the -l parameter can be used to list all destination objects before transferring, which is more efficient and reduces request costs. 30 | 31 | * Version 2 now supports multi-threaded parallel listing, significantly speeding up the listing process for buckets with large numbers of objects. For example, a bucket with 30 million objects that would normally take over 90 minutes to list (e.g., with aws s3 ls) can now be listed in just 1-2 minutes using 64 concurrent threads (16 vCPU). 32 | 33 | * Supports saving the compared task list to a file, saving logs of tasks sent to SQS to a file, setting an exclusion list to skip transferring keys or local paths matching the list, a DRYRUN mode to compare sources and destinations without transferring data, and a mode to overwrite destinations without comparison. 34 | 35 | * Supports setting a resumable transfer threshold, parallel thread count, request timeout, maximum retry count, and an option to ignore confirmation prompts and execute directly. 
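The delta comparison described in the features above (match source and destination objects by key and size, then transfer only what differs) can be pictured with a minimal sketch. The `FileInfo` struct and `deltaList` helper below are illustrative stand-ins rather than the project's actual types; the real per-object checks live in `check.go` (for example `checkFileExistList`):

```go
// Illustrative sketch only (not the project's code): decide which source
// objects still need to be transferred by comparing key and size against
// a one-time listing of the target bucket.
package main

import "fmt"

// FileInfo is a simplified stand-in for the object info the tool tracks.
type FileInfo struct {
	Key  string
	Size int64
}

// deltaList returns the source objects that are missing from the target or
// present with a different size, i.e. the objects that still need transfer.
func deltaList(source, target []FileInfo) []FileInfo {
	targetSize := make(map[string]int64, len(target))
	for _, t := range target {
		targetSize[t.Key] = t.Size
	}
	var todo []FileInfo
	for _, s := range source {
		if size, ok := targetSize[s.Key]; !ok || size != s.Size {
			todo = append(todo, s)
		}
	}
	return todo
}

func main() {
	source := []FileInfo{{Key: "a/1.bin", Size: 100}, {Key: "a/2.bin", Size: 200}}
	target := []FileInfo{{Key: "a/1.bin", Size: 100}, {Key: "a/2.bin", Size: 150}}
	fmt.Println(deltaList(source, target)) // only a/2.bin needs transfer: sizes differ
}
```

Building the target lookup once from a full listing is what makes the `-l` (list-target) mode cheaper in API calls than issuing a HEAD request per object, at the cost of a slower start.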
36 | 
37 | ## Usage
38 | 
39 | ### Install Go Runtime
40 | 
41 | For first-time use, install the Golang runtime; example for Linux:
42 | 
43 | ```shell
44 | sudo yum install go git -y
45 | git clone https://github.com/aws-samples/amazon-s3-resumable-upload
46 | ```
47 | 
48 | For China regions, use a Go proxy to speed up downloading Go packages by adding:
49 | 
50 | ```shell
51 | go env -w GOPROXY=https://goproxy.cn,direct
52 | ```
53 | 
54 | ### Compile Go Code
55 | 
56 | ```shell
57 | cd amazon-s3-resumable-upload
58 | go build . # downloads dependencies and compiles
59 | ```
60 | 
61 | Use ./s3trans -h to see help
62 | 
63 | ### Quick Start
64 | 
65 | * Download S3 file to local:
66 | 
67 | ```shell
68 | ./s3trans s3://bucket-name/prefix /local/path
69 | # Above uses the default AWS profile in ~/.aws/credentials, or the IAM Role if on EC2. To specify a profile for the source S3:
70 | ./s3trans s3://bucket-name/prefix /local/path --from-profile=source_profile
71 | ```
72 | 
73 | * Upload local file to S3:
74 | 
75 | ```shell
76 | ./s3trans /local/path s3://bucket-name/prefix
77 | # Above uses the default AWS profile in ~/.aws/credentials, or the IAM Role if on EC2. To specify a profile for the destination S3:
78 | ./s3trans /local/path s3://bucket-name/prefix --to-profile=dest_profile
79 | ```
80 | 
81 | * S3 to S3; the region is auto-detected if not specified:
82 | 
83 | ```shell
84 | ./s3trans s3://bucket-name/prefix s3://bucket-name/prefix --to-profile=dest_profile
85 | # If --from-profile is not set, the default profile or EC2 IAM Role is used. Both can also be specified:
86 | ./s3trans s3://bucket-name/prefix s3://bucket-name/prefix --from-profile=source_profile --to-profile=dest_profile
87 | ```
88 | 
89 | * For non-AWS S3 compatible storage, specify the endpoint:
90 | 
91 | ```shell
92 | ./s3trans s3://bucket-gcs-test s3://bucket-virginia --from-profile=gcs_profile --to-profile=aws_profile --from-endpoint=https://storage.googleapis.com
93 | # Short names can be used for endpoints, e.g. --from-endpoint=google_gcs; also supported: ali_oss, tencent_cos, azure_blob(TODO: azure)
94 | ```
95 | 
96 | * -l to list the target before transfer (fewer API calls but slower start)
97 | * -n (n is NumWorkers) to specify concurrency for listing and transfers. Max concurrent objects is n, max concurrent parts per object is 4n, max concurrent listing is 4n. Recommend n <= vCPU number
98 | * -y to auto-confirm the prompt
99 | 
100 | ```shell
101 | ./s3trans C:\Users\Administrator\Downloads\test\ s3://huangzb-virginia/win2/ --to-profile sin -l -n 8 -y
102 | ```
103 | 
104 | ## Download from Presign URL or URL list
105 | 
106 | Download a single presigned URL concurrently:
107 | 
108 | ```shell
109 | ./s3trans "https://your_bucket.s3.region.amazonaws.com/prefix/filename?X-Amz-Algorithm=xxxx&X-Amz-Credential=xxxx&X-Amz-Date=xxxx&X-Amz-Expires=xxxx&X-Amz-SignedHeaders=host&X-Amz-Signature=xxxx" /localpath_download_to/
110 | ```
111 | 
112 | Download from a list of URLs. In the example below, the URL list file is named list_file.txt, with each line containing a presigned URL:
113 | 
114 | ```shell
115 | ./s3trans /mypath/list_file.txt /localpath_download_to/ \
116 |     --work-mode HTTP_DOWNLOAD_LIST
117 | ```
118 | 
119 | ## More usage help
120 | 
121 | ```shell
122 | ./s3trans -h
123 | 
124 | s3trans transfers data from source to target.
125 | ./s3trans FROM_URL TO_URL [OPTIONS]
126 | FROM_URL: The url of data source, e.g. /home/user/data or s3://bucket/prefix
127 | TO_URL: The url of data transfer target, e.g.
/home/user/data or s3://bucket/prefix 128 | For example: 129 | ./s3trans s3://bucket/prefix s3://bucket/prefix -from_profile sin -to_profile bjs 130 | ./s3trans s3://bucket/prefix /home/user/data -from_profile sin 131 | 132 | Usage: 133 | s3trans FROM_URL TO_URL [flags] 134 | 135 | Flags: 136 | --acl string The TARGET S3 bucket ACL, private means only the object owner can read&write, e.g. private|public-read|public-read-write|authenticated-read|aws-exec-read|bucket-owner-read|bucket-owner-full-control 137 | --from-endpoint string The endpoint of data source, e.g. https://storage.googleapis.com; https://oss-.aliyuncs.com; https://cos..myqcloud.com . If AWS s3 or local path, no need to specify this. 138 | --from-profile string The AWS profile in ~/.aws/credentials of data source 139 | --force-path-style Set this to true to force the request to use path-style addressing See http://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html 140 | --from-region string The region of data transfer source, e.g. cn-north-1. If no specified, the region will be auto detected with the credentials you provided in profile. 141 | -h, --help help for s3trans 142 | --http-timeout int API request timeout (seconds) (default 30) 143 | -l, --list-target List the TARGET S3 bucket, compare exist objects BEFORE transfer. List is more efficient than head each object to check if it exists, but transfer may start slower because it needs to wait for listing all objects to compare. To mitigate this, this app leverage Concurrency Listing for fast list; If no list-target para, transfer without listing the target S3 bucket, but before transfering each object, head each target object to check, this costs more API call, but start faster. 144 | --max-retries int API request max retries (default 5) 145 | --no-sign-request The SOURCE bucket is not needed to sign the request 146 | -n, --num-workers int NumWorkers*1 for concurrency files; NumWorkers*4 for parts of each file and for listing target bucket; Recommend NumWorkers <= vCPU number (default 4) 147 | --request-payer The SOURCE bucket requires requester to pay, set this 148 | --resumable-threshold int When the file size (MB) is larger than this value, the file will be resumable transfered. (default 50) 149 | -s, --skip-compare If True, skip to compare the name and size between source and target S3 object. Just overwrite all objects. No list target nor head target object to check if it already exists. 150 | --sqs-profile string The SQS queue leverage which AWS profile in ~/.aws/credentials 151 | --sqs-url string The SQS queue URL to send or consume message from, e.g. https://sqs.us-east-1.amazonaws.com/my_account/my_queue_name 152 | --storage-class string The TARGET S3 bucket storage class, e.g. STANDARD|REDUCED_REDUNDANCY|STANDARD_IA|ONEZONE_IA|INTELLIGENT_TIERING|GLACIER|DEEP_ARCHIVE|OUTPOSTS|GLACIER_IR|SNOW or others of S3 compatibale 153 | --to-endpoint string The endpoint of data transfer target, e.g. https://storage.googleapis.com . If AWS s3 or local path, no need to specify this. 154 | --to-profile string The AWS profile in ~/.aws/credentials of data transfer target 155 | --to-region string The region of data transfer target, e.g. us-east-1. If no specified, the region will be auto detected with the credentials you provided in profile. 156 | --transfer-metadata If True, get metadata from source S3 bucket and upload the metadata to target object. This costs more API calls. 
157 | --work-mode string SQS_SEND | SQS_CONSUME | DRYRUN | HTTP_DOWNLOAD_LIST; SQS_SEND means listing source FROM_URL S3 and target TO_URL S3 to compare and send message to SQS queue, SQS_CONSUME means consume message from SQS queue and transfer objects from FROM_URL S3 to TO_URL S3; DRYRUN means only count the objects and sizes comparing delta list of FROM_URL S3 and TO_URL S3, no transfer; HTTP_DOWNLOAD_LIST, from a list file with lines of presign url 158 | -y, --y Ignore waiting for confirming command 159 | ``` 160 | 161 | ## License 162 | 163 | This library is licensed under the MIT-0 License. See the LICENSE file. 164 | 165 | ****** 166 | Author: Huang, Zhuobin (James) 167 | ****** 168 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Amazon S3 Resumable Transfer V2 (Amazon S3 断点续传传输 V2) 2 | 3 | English README: [README.en.md](README.en.md) 4 | 5 | 多线程断点续传,适合批量的大文件S3上传/下载本地/跨对象存储迁移,支持Amazon S3, Ali OSS, Tencent COS, Google GCS, HuaweiCloud 等兼容S3 API的对象存储 6 | 本 Version 2 在同一个应用通过配置即可用做各种场景:单机的上传,单机的下载,部署为集群版的扫描源文件,或作为集群版的分布式传输工作节点;用Golang做了重构,提高性能;支持了一系列扩展功能:排除列表、源no-sign-request、源request-payer、目的storage-class、目的ACL、传输 Metadata 等。 7 | 8 | ![img](./img/arch-cn1.png) 9 | ![img](./img/arch-cn2.png) 10 | 11 | ## 功能 12 | 13 | * 多线程并发传输到多种对象存储,断点续传,自动重传。多文件任务并发,充分利用带宽。优化的流控机制。在一次集群测试中(10台m5.large),迁移1.2TB数据从 us-east-1 到 cn-northwest-1 只用1小时。在另一个单机带宽的测试中,同 Region 的两个 S3 用 m6i.8xlarge EC2 传输 (num-workers 16),跑出了单机持续 12Gbps 的传输速度。 14 | 15 | * 支持的源和目的地:本地目录或单个文件, Amazon S3, Ali OSS, Tencent COS, Google GCS 等对象存储。无需区分工作模式,指定好源和目的URL或本地路径即可自动识别并开始传输。可以是单个文件或对象,或整个目录,或S3桶/前缀等URL。 16 | 17 | * 传输数据只以单个分片的形式过中转节点的内存,不落盘到节点,节省时间且更安全。可支撑 0 Size 至 TB 级别 。 18 | 19 | * 支持设置目的地的各种对象存储级别,如:STANDARD|REDUCED_REDUNDANCY|STANDARD_IA|ONEZONE_IA|INTELLIGENT_TIERING|GLACIER|DEEP_ARCHIVE|OUTPOSTS|GLACIER_IR|SNOW 20 | 21 | * 支持指定目的S3的ACL: private|public-read|public-read-write|authenticated-read|aws-exec-read|bucket-owner-read|bucket-owner-full-control 22 | 23 | * 支持设置源对象存储是no-sign-request和request-payer 24 | 25 | * 支持源对象是 Presign URL 或 URL 列表(注意生成Presign URL的时候指定 Bucket 正确的 Region) 26 | 27 | * 支持获取源对象存储的 Metadata 也复制到目的对象存储。但要注意这个需要每个对象都Head去获取一次,会影响性能和增加对源S3的请求次数费用。 28 | 29 | * 自动对比源/目的桶的文件名和大小,不一致的才传输。默认是一边List,一边传输,即逐个获取目的对象信息对比一个就传输一个,这样使用体验是输入命令之后就立马启动传输(类似AWS CLI);可选设置 -l 参数,为List目的对象列表之后再进行传输,因为List比逐个Head对比效率更高,也节省请求次数的费用。 30 | 31 | * 本次 Version 2 支持了多线程并行 List ,对于对象数量很多的情况,可以更快完成List。例如3千万对象的桶,如果按正常 List(例如 aws s3 ls)要90分钟以上,而现在在使用64并发的情况(16vCPU)下缩减到只有 1 到 2 分钟。 32 | 33 | * 支持把对比扫描出来的任务列表存入文件;支持把已发送到SQS的日志存入文件;支持设置排除列表,如果数据源Key或源本地路径符合排除列表的则不传输;支持DRYRUN模式,只比较源和目的桶,统计数量和Size,不传输数据;支持不做对比不检查目的对象,直接覆盖的模式。 34 | 35 | * 支持设置断点续传阈值;设置并行线程数;设置请求超时时间;设置最大重试次数;支持设置是否忽略确认命令,直接执行; 36 | 37 | ## 使用说明 38 | 39 | ### 安装Go运行环境 40 | 41 | 首次使用需要安装Golang运行环境,以Linux为例: 42 | 43 | ```shell 44 | sudo yum install go git -y 45 | ``` 46 | 47 | 如果在中国区,可通过go代理来下载go依赖包,则多运行一句代理设置: 48 | 49 | ```go 50 | go env -w GOPROXY=https://goproxy.cn,direct 51 | ``` 52 | 53 | ### 下载和编译本项目的Go代码 54 | 55 | ```shell 56 | git clone https://github.com/aws-samples/amazon-s3-resumable-upload 57 | cd amazon-s3-resumable-upload 58 | go build . 
# 下载依赖包并编译程序 59 | ``` 60 | 61 | 可使用 ./s3trans -h 获取帮助信息 62 | 63 | ### 使用 64 | 65 | * 下载S3文件到本地: 66 | 67 | ```shell 68 | ./s3trans s3://bucket-name/prefix /local/path 69 | # 以上是使用默认AWS profile in ~/.aws/credentials,如果是EC2且没有配置 profile 而是使用IAM Role,需指定一下 Region 70 | ./s3trans s3://bucket-name/prefix /local/path --from-region=my_region 71 | # 如果要指定S3的profile则如下: 72 | ./s3trans s3://bucket-name/prefix /local/path --from-profile=source_profile 73 | ``` 74 | 75 | * 上传本地文件到S3: 76 | 77 | ```shell 78 | ./s3trans /local/path s3://bucket-name/prefix 79 | # 以上是使用默认AWS profile in ~/.aws/credentials,如果是EC2且没有配置 profile 而是使用IAM Role,需指定一下 Region 80 | ./s3trans /local/path s3://bucket-name/prefix --to-region=my_region 81 | # 如果要指定S3的profile则如下: 82 | ./s3trans /local/path s3://bucket-name/prefix --to-profile=dest_profile 83 | ``` 84 | 85 | * 从S3到S3,如不指定region,则程序会先自动查询Bucket的Region: 86 | 87 | ```shell 88 | ./s3trans s3://bucket-name/prefix s3://bucket-name/prefix --from-profile=source_profile --to-profile=dest_profile 89 | # 如果from-profile不填则获取默认的profile或使用EC2 IAM Role,需指定一下region 90 | ./s3trans s3://bucket-name/prefix s3://bucket-name/prefix --from-region=my_region --to-profile=dest_profile 91 | ``` 92 | 93 | * 对于非AWS的S3兼容存储,则需要指定endpoint 94 | 95 | ```shell 96 | ./s3trans s3://bucket-gcs-test s3://bucket-virginia --from-profile=gcs_profile --to-profile=aws_profile --from-endpoint=https://storage.googleapis.com 97 | # 以上endpoint也可以用简称替换,即:--from-endpoint=google_gcs,还可以是其他简称:ali_oss, tencent_cos, azure_blob(TODO: azure) 98 | ``` 99 | 100 | * -l 指定先List再同步数据(节省请求次数费用,但会增加一次List的时间) 101 | * -n (n 即NumWorkers)指定并行List和并行传输线程数。最大并发对象数为n,每个对象最大并发为2n,List Bucket时最大并发为4n;推荐n <= vCPU numbe 102 | * -y 忽略确认命令,直接执行 103 | 104 | ```shell 105 | ./s3trans C:\Users\Administrator\Downloads\test\ s3://huangzb-virginia/win2/ --to-profile sin -l -n 8 -y 106 | ``` 107 | 108 | * 支持设置排除列表 (--ignore-list-path) 如果数据源的S3 Key或源本地路径符合排除列表的则不传输 109 | 例如,排除列表路径设置为 --ignore-list-path="./ignore-list.txt" 文件内容为: 110 | 111 | ```text 112 | test2/ 113 | test1 114 | ``` 115 | 116 | 则源数据中遇到这些路径都会被跳过,不传输:test2/abc.zip, test1/abc.zip, test1, test1.zip, test2/cde/efg等... 117 | 而这些路径则会正常传输,因为开头Prefix不一致:test3/test1, test3/test2/ 等... 
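下面用一段示意性的 Go 代码说明上述按 Key/路径前缀匹配的排除逻辑(仅为示意,并非本项目的实际实现;项目中由 download.go 调用的 isIgnored 函数完成类似判断,此处的 isIgnoredSketch 为假设的演示函数名):

```go
// 示意代码:按前缀匹配判断某个 Key 或本地路径是否命中排除列表。
// 仅用于说明上面的匹配规则,非本项目实际实现。
package main

import (
	"fmt"
	"strings"
)

// isIgnoredSketch 为演示用的假设函数:排除列表中的每一行都按前缀匹配。
func isIgnoredSketch(key string, ignoreList []string) bool {
	for _, prefix := range ignoreList {
		prefix = strings.TrimSpace(prefix)
		if prefix == "" {
			continue
		}
		if strings.HasPrefix(key, prefix) {
			return true // Key 以排除列表中的某个前缀开头,跳过传输
		}
	}
	return false
}

func main() {
	ignoreList := []string{"test2/", "test1"}
	fmt.Println(isIgnoredSketch("test1/abc.zip", ignoreList)) // true:命中前缀 test1,跳过
	fmt.Println(isIgnoredSketch("test3/test1", ignoreList))   // false:开头 Prefix 不一致,正常传输
}
```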
118 | 119 | ## 集群模式 120 | 121 | ### 集群模式的 List 模块使用 122 | 123 | 对比源Bucket/Prefix和目的Bucket/Prefix,把不一致的对象信息写入SQS队列,以便后续的传输节点使用。 124 | 需要指定源S3和目的S3的URL,另还需要指定一个SQS用于发送任务列表,包括SQS的url和能访问这个SQS所用的AWS profile。不指定 sqs profile 则程序会自动从 EC2 IAM Role 获取权限,Region名称会从sqs-url自动提取。 125 | 可选: 126 | 设置把对比扫描出来的任务列表存入文件 --joblist-write-to-filepath; 127 | 设置把SQS发送的日志存入文件 --sqs-log-to-filename 128 | 129 | ```shell 130 | ./s3trans s3://from_bucket/ s3://to_bucket/prefix --from-profile us --to-profile bjs \ 131 | --work-mode SQS_SEND 132 | --sqs-profile us \ 133 | --sqs-url "https://sqs.region.amazonaws.com/my_account_number/sq_queue_sname" \ 134 | --joblist-write-to-filepath "./my_joblist.log" \ 135 | --sqs-log-to-filename "./sqssent.log" \ 136 | -y -l -n 8 137 | ``` 138 | 139 | ### 集群模式的传输节点使用 140 | 141 | 从SQS队列中获取任务列表,然后传输数据。需要指定源S3和目的S3的URL,另还需要指定一个SQS用于发送任务列表,包括SQS的url和能访问这个SQS所用的AWS profile。不指定 sqs profile 则程序会自动从 EC2 IAM Role 获取权限,Region名称会从sqs-url自动提取。 142 | 143 | ```shell 144 | ./s3trans s3://from_bucket/prefix s3://to_bucket/ --from-profile us --to-profile bjs \ 145 | --work-mode SQS_CONSUME 146 | --sqs-profile us \ 147 | --sqs-url "https://sqs.region.amazonaws.com/my_account_number/sq_queue_sname" \ 148 | -y -l -n 8 149 | ``` 150 | 151 | ## 下载从 Presign URL 或 URL 列表 152 | 153 | 多线程并发下载单一个 URL (presign url) 154 | 155 | ```shell 156 | ./s3trans "https://your_bucket.s3.region.amazonaws.com/prefix/filename?X-Amz-Algorithm=xxxx&&X-Amz-Credential=xxxx&&X-Amz-Date=xxxx&&X-Amz-Expires=xxxx&X-Amz-SignedHeaders=host&X-Amz-Signature=xxxx" /localpath_download_to/ 157 | ``` 158 | 159 | 多线程并发按照 URL 列表下载,下例子中 URL 列表文件名为 list_file.txt,文件中每行为一个 presign URL 160 | 161 | ```shell 162 | ./s3trans /mypath/list_file.txt /localpath_download_to/ 163 | --work-mode HTTP_DOWNLOAD_LIST 164 | ``` 165 | 166 | ## 其他使用帮助 167 | 168 | ./s3trans -h 169 | 170 | s3trans 从源传输数据到目标 171 | ./s3trans FROM_URL TO_URL [OPTIONS] 172 | FROM_URL: 数据源的URL,例如 /home/user/data or s3://bucket/prefix 173 | TO_URL: 传输目标的URL,例如 /home/user/data or s3://bucket/prefix 174 | 175 | Usage: 176 | s3trans FROM_URL TO_URL [flags] 177 | 178 | ```shell 179 | Flags: 180 | --acl string 目标S3桶的ACL,private表示只有对象所有者可以读写,例如 private|public-read|public-read-write|authenticated-read|aws-exec-read|bucket-owner-read|bucket-owner-full-control ,不设置则默认根据S3的默认设置,通常是 private 模式 181 | --from-endpoint string 数据源的 API Endpoint 例如 https://storage.googleapis.com; https://oss-shenzhen.aliyuncs.com; https://cos..myqcloud.com 如果是AWS S3或本地路径,无需指定这个 Endpoint 182 | --from-profile string 数据源在~/.aws/credentials中的AWS profile,如果不指定profile则用default profile,如果没有default profile,则需指定region 183 | --force-path-style 设置为true时,可强制请求使用路径样式寻址,而不是域名。参考:http://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html 184 | --from-region string 数据源的区域,例如 cn-north-1. 
如果未指定,但有设置 profile 则会自动找S3的所在 Region 185 | -h, --help 帮助文档 186 | --http-timeout int API请求超时(秒)(默认30) 187 | -l, --list-target 推荐使用。列出目标S3桶,传输之前先比较现有对象。因为列表方式比逐个对象请求检查是否存在更有效率,只是因为需要等待列出所有对象进行比较,然后再开始传输所以感觉启动较慢。为了缓解这个问题,此应用程序利用多线程并行List,进行快速列表;如果没有设置--list-target参数,就不List目标S3桶了,而是在传输每个对象之前,检查每个目标对象,这会消耗更多API调用,但开始更快;如果完全不希望做对比,直接覆盖,则用下面提到的--skip-compare参数,而不用--list-target了; 188 | --max-retries int API请求最大重试次数(默认5) 189 | --no-sign-request 源桶不需要请求签名(即允许匿名)的情况 190 | -n, --num-workers int NumWorkers x 1 个并发线程传输文件;NumWorkers x 2 每个文件的并发分片同时传输的线程数;NumWorkers x 4 List目标桶的并发线程数;推荐NumWorkers <= vCPU数量(默认4) 191 | --request-payer 源桶要求请求者支付的情况 192 | --resumable-threshold int 当文件大小(MB)大于此值时,使用断点续传。(默认50) 193 | -s, --skip-compare 跳过比较源和目标S3对象的名称和大小。直接覆盖所有对象。不列出目标也不检查目标对象是否已存在。 194 | --sqs-profile string work-mode为SQS_SEND或SQS_CONSUME的场景下,为访问SQS队列使用~/.aws/credentials中的哪个AWS profile,不指定sqs profile则程序会自动从EC2 IAM Role获取权限,Region名称会从sqs-url自动提取。 195 | --sqs-url string work-mode为SQS_SEND或SQS_CONSUME的场景下,指定发送或消费消息的SQS队列URL,例如 https://sqs.us-east-1.amazonaws.com/my_account/my_queue_name 196 | --storage-class string 目标S3桶的存储类,例如 STANDARD|REDUCED_REDUNDANCY|STANDARD_IA|ONEZONE_IA|INTELLIGENT_TIERING|GLACIER|DEEP_ARCHIVE|OUTPOSTS|GLACIER_IR|SNOW 或其他S3兼容的 197 | --to-endpoint string 数据传输目标的端点,例如 https://storage.googleapis.com . 如果是AWS S3或本地路径,无需指定这个 198 | --to-profile string 数据传输目标在~/.aws/credentials中的AWS profile,如果不指定profile则用default profile,如果没有default profile,则需指定region 199 | --to-region string 数据传输目标的区域,例如 cn-north-1. 如果未指定,但有设置 profile 则会自动找S3的所在 Region 200 | --transfer-metadata 从源S3桶获取元数据并上传到目标对象。这需要每传输一个对象都通过API调用获取源文件元数据。 201 | --work-mode string SQS_SEND | SQS_CONSUME | DRYRUN | HTTP_DOWNLOAD_LIST; SQS_SEND:扫描节点,表示列出源S3和目标S3进行比较,并发送传输任务消息到SQS队列;SQS_CONSUME: 工作节点,表示从SQS队列获取任务消息并从来源S3传输对象到S3; HTTP_DOWNLOAD_LIST:从一个文件中的 presign url 列表(即一个http列表)下载; 202 | -y, --y 忽略等待确认,直接执行;DRYRUN是只比较源和目的桶,统计数量和Size,不传输数据 203 | ``` 204 | 205 | ## 其他说明 206 | 207 | ### S3 触发 SQS 的 Policy示例 208 | 209 | 写入SQS权限:"Service": "s3.amazonaws.com" 210 | 读取SQS权限:EC2 Role 或直接填 AWS Account Number 211 | 212 | ```json 213 | { 214 | "Version": "2008-10-17", 215 | "Id": "__default_policy_ID", 216 | "Statement": [ 217 | { 218 | "Sid": "__owner_statement", 219 | "Effect": "Allow", 220 | "Principal": { 221 | "AWS": "arn:aws:iam::my_account_number:root" 222 | }, 223 | "Action": "SQS:*", 224 | "Resource": "arn:aws:sqs:us-west-2:my_account_number:s3_migration_queque" 225 | }, 226 | { 227 | "Sid": "__sender_statement", 228 | "Effect": "Allow", 229 | "Principal": { 230 | "Service": "s3.amazonaws.com" 231 | }, 232 | "Action": "SQS:SendMessage", 233 | "Resource": "arn:aws:sqs:us-west-2:my_account_number:s3_migration_queque" 234 | }, 235 | { 236 | "Sid": "__receiver_statement", 237 | "Effect": "Allow", 238 | "Principal": { 239 | "AWS": "arn:aws:iam::my_account_number:root" 240 | }, 241 | "Action": [ 242 | "SQS:ChangeMessageVisibility", 243 | "SQS:DeleteMessage", 244 | "SQS:ReceiveMessage" 245 | ], 246 | "Resource": "arn:aws:sqs:us-west-2:my_account_number:s3_migration_queque" 247 | } 248 | ] 249 | } 250 | 251 | ``` 252 | 253 | ### 配置文件 254 | 255 | 如果不使用上面的命令行参数,而使用配置文件,可以在程序运行目录下写一个config.yaml文件,内容如下。然后只需要运行 ./s3trans FROM_URL TO_URL 即可。 256 | 257 | ```yaml 258 | from-profile: "your_from_profile" 259 | to-profile: "your_to_profile" 260 | from-endpoint: "your_from_endpoint" 261 | to-endpoint: "your_to_endpoint" 262 | from-region: "your_from_region" 263 | to-region: "your_to_region" 264 | storage-class: "your_storage_class" 265 | acl: "your_acl" 266 | 
no-sign-request: false 267 | request-payer: false 268 | db-location: "./your_download_status.db" 269 | list-target: false 270 | skip-compare: false 271 | transfer-metadata: false 272 | http-timeout: 30 273 | max-retries: 5 274 | retry-delay: 5 275 | chunk-size: 5 276 | resumable-threshold: 50 277 | num-workers: 4 278 | y: false 279 | work-mode: "your_work_mode" 280 | sqs-url: "your_sqs_url" 281 | sqs-profile: "your_sqs_profile" 282 | joblist-write-to-filepath: "your_joblist_write_to_filepath" 283 | sqs-log-to-filename: "your_sqs_log_to_filename" 284 | ignore-list-path: "your_ignore_list_path" 285 | ``` 286 | 287 | 还可以把以上配置写入环境变量。 288 | 289 | ## License 290 | 291 | This library is licensed under the MIT-0 License. See the LICENSE file. 292 | 293 | ****** 294 | Author: Huang, Zhuobin (James) 295 | ****** 296 | -------------------------------------------------------------------------------- /check.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log" 7 | "path" 8 | "sync" 9 | 10 | "github.com/aws/aws-sdk-go/aws" 11 | "github.com/aws/aws-sdk-go/aws/awserr" 12 | "github.com/aws/aws-sdk-go/service/s3" 13 | "golang.org/x/sync/semaphore" 14 | ) 15 | 16 | func checkPartnumberList(svc *s3.S3, bucket, key, uploadId string) ([]PartInfo, error) { 17 | var partNumbers []PartInfo 18 | var partNumbersPrint []int64 19 | err := svc.ListPartsPages(&s3.ListPartsInput{ 20 | Bucket: aws.String(bucket), 21 | Key: aws.String(key), 22 | UploadId: aws.String(uploadId), 23 | }, func(page *s3.ListPartsOutput, lastPage bool) bool { 24 | for _, part := range page.Parts { 25 | partNumbers = append(partNumbers, PartInfo{ 26 | PartNumber: *part.PartNumber, 27 | Etag: *part.ETag, 28 | }) 29 | partNumbersPrint = append(partNumbersPrint, *part.PartNumber) 30 | } 31 | return !lastPage 32 | }) 33 | log.Printf(" Exist %d parts on s3://%s :%v\n", len(partNumbersPrint), path.Join(bucket, key), partNumbersPrint) 34 | return partNumbers, err 35 | } 36 | 37 | func checkFileExistHead(svc *s3.S3, tofileInfo FileInfo, multipartUploadsList []*s3.MultipartUpload) (string, error) { 38 | exist, err := headFile(svc, tofileInfo) 39 | if err != nil { 40 | return "", err 41 | } 42 | if exist { 43 | return "NEXT", nil 44 | } 45 | 46 | // 找不到文件,或文件不一致,且要重新传的,查是否有MultipartUpload ID 47 | uploadId, err := checkMultipartUploadId(tofileInfo, multipartUploadsList) 48 | 49 | return uploadId, err 50 | } 51 | 52 | func compareMetaStructs(meta1, meta2 MetaStruct) bool { 53 | if aws.StringValue(meta1.ContentType) != aws.StringValue(meta2.ContentType) || 54 | aws.StringValue(meta1.ContentLanguage) != aws.StringValue(meta2.ContentLanguage) || 55 | aws.StringValue(meta1.ContentEncoding) != aws.StringValue(meta2.ContentEncoding) || 56 | aws.StringValue(meta1.CacheControl) != aws.StringValue(meta2.CacheControl) || 57 | aws.StringValue(meta1.ContentDisposition) != aws.StringValue(meta2.ContentDisposition) { 58 | return false 59 | } 60 | if len(meta1.Metadata) != len(meta2.Metadata) { 61 | return false 62 | } 63 | for k, v := range meta1.Metadata { 64 | if v2, ok := meta2.Metadata[k]; !ok || aws.StringValue(v) != aws.StringValue(v2) { 65 | return false 66 | } 67 | } 68 | return true 69 | } 70 | 71 | func headFile(svc *s3.S3, tofileInfo FileInfo) (bool, error) { 72 | log.Printf(" Call HEAD to compare target s3://%s\n", path.Join(tofileInfo.ToBucket, tofileInfo.ToKey)) 73 | input := &s3.HeadObjectInput{ 74 | Bucket: aws.String(tofileInfo.ToBucket), 75 | Key: 
aws.String(tofileInfo.ToKey), 76 | } 77 | result, err := svc.HeadObject(input) 78 | // If Not Exist 79 | if err != nil { 80 | if aerr, ok := err.(awserr.RequestFailure); ok { 81 | if aerr.StatusCode() == 404 { 82 | return false, nil 83 | } 84 | } 85 | return false, err 86 | } 87 | // If Exist check size 88 | if *result.ContentLength == tofileInfo.Size { 89 | // If Exist and need to check metadata 90 | if cfg.TransferMetadata { 91 | log.Printf(" Comparing metadata of target s3://%s\n", path.Join(tofileInfo.ToBucket, tofileInfo.ToKey)) 92 | resultStruct := MetaStruct{ 93 | Metadata: result.Metadata, 94 | ContentType: result.ContentType, 95 | ContentLanguage: result.ContentLanguage, 96 | ContentEncoding: result.ContentEncoding, 97 | CacheControl: result.CacheControl, 98 | ContentDisposition: result.ContentDisposition, 99 | } 100 | 101 | if !compareMetaStructs(resultStruct, tofileInfo.Others) { 102 | log.Printf("...Metadata not match, upload target s3://%s\n", path.Join(tofileInfo.ToBucket, tofileInfo.ToKey)) 103 | return false, nil 104 | } 105 | 106 | } 107 | return true, nil 108 | } 109 | return false, nil 110 | } 111 | 112 | func checkFileExistList(tofileInfo FileInfo, targetObjectList []*s3.Object, multipartUploadsList []*s3.MultipartUpload) (string, error) { 113 | for _, f := range targetObjectList { 114 | if *f.Key == tofileInfo.ToKey && *f.Size == tofileInfo.Size { 115 | return "NEXT", nil // 文件完全相同 116 | } 117 | } 118 | 119 | // 找不到文件,或文件不一致,且要重新传的,查是否有MultipartUpload ID 120 | uploadId, err := checkMultipartUploadId(tofileInfo, multipartUploadsList) 121 | return uploadId, err 122 | } 123 | 124 | func checkMultipartUploadId(tofileInfo FileInfo, multipartUploadsList []*s3.MultipartUpload) (string, error) { 125 | if tofileInfo.Size < cfg.ResumableThreshold { 126 | return "", nil // 文件小于ResumableThreshold,不需要分片 127 | } 128 | // 查所有相同Key的ID给keyIDList 129 | var keyIDList []*s3.MultipartUpload 130 | for _, u := range multipartUploadsList { 131 | if *u.Key == tofileInfo.ToKey { 132 | keyIDList = append(keyIDList, u) 133 | } 134 | } 135 | 136 | // 如果找不到上传过的MultipartUpload,则从头开始传 137 | if len(keyIDList) == 0 { 138 | return "", nil 139 | } 140 | 141 | // 对同一个Key的不同MultipartUpload ID排序找出时间最晚的值 142 | var latestUpload *s3.MultipartUpload 143 | for _, u := range keyIDList { 144 | if latestUpload == nil || u.Initiated.After(*latestUpload.Initiated) { 145 | latestUpload = u 146 | } 147 | } 148 | 149 | return *latestUpload.UploadId, nil 150 | } 151 | 152 | func getUploadId(svc *s3.S3, fileInfo FileInfo, multipartUploadsList []*s3.MultipartUpload, targetObjectList []*s3.Object) (string, error) { 153 | var uploadId string 154 | var err error 155 | if !cfg.SkipCompare { // 设置不做Compare了就不对比目的对象,直接覆盖 156 | if cfg.TransferMetadata || !cfg.ListTarget { // 要传metadata就必须用Head方式去获取对比;不ListTarget也是逐个Head去对比 157 | uploadId, err = checkFileExistHead(svc, fileInfo, multipartUploadsList) 158 | if err != nil { 159 | log.Printf("failed to checkFileExistHead, %v", err) 160 | return "", err 161 | } 162 | } else if cfg.ListTarget && !cfg.TransferMetadata { // 不要metadata就用list方式去获取对比(如果设置了ListTraget True) 163 | uploadId, err = checkFileExistList(fileInfo, targetObjectList, multipartUploadsList) 164 | if err != nil { 165 | log.Printf("failed to checkFileExistList, %v", err) 166 | return "", err 167 | } 168 | } 169 | } 170 | return uploadId, nil 171 | } 172 | 173 | func getMultipartUploadList(svc *s3.S3, bucket string, prefix string) ([]*s3.MultipartUpload, error) { 174 | // log.Printf("Listing multipart uploads ID in target 
s3://%s\n", path.Join(bucket, prefix)) 175 | var uploads []*s3.MultipartUpload 176 | err := svc.ListMultipartUploadsPages(&s3.ListMultipartUploadsInput{ 177 | Bucket: aws.String(bucket), 178 | Prefix: aws.String(prefix), 179 | }, func(page *s3.ListMultipartUploadsOutput, lastPage bool) bool { 180 | uploads = append(uploads, page.Uploads...) 181 | return true // return false to stop pagination 182 | }) 183 | 184 | if err != nil { 185 | return nil, err 186 | } 187 | log.Printf("There are %d multipart uploads ID already in target s3://%s\n", len(uploads), path.Join(bucket, prefix)) 188 | 189 | return uploads, nil 190 | } 191 | 192 | func getS3ObjectList(b BInfo) ([]*s3.Object, error) { 193 | log.Printf("Listing s3://%s\n", path.Join(b.bucket, b.prefix)) 194 | var s3Objects []*s3.Object 195 | var mu sync.Mutex 196 | var wg sync.WaitGroup 197 | var sem = semaphore.NewWeighted(int64(cfg.NumWorkers * 4)) 198 | 199 | concurrencyListObjects(b.svc, b.bucket, b.prefix, sem, &s3Objects, &mu, &wg) 200 | wg.Wait() 201 | log.Printf("There are %d objects already in s3://%s\n", len(s3Objects), path.Join(b.bucket, b.prefix)) 202 | 203 | return s3Objects, nil 204 | } 205 | 206 | func concurrencyListObjects(svc *s3.S3, bucket, prefix string, sem *semaphore.Weighted, 207 | s3Objects *[]*s3.Object, mu *sync.Mutex, wg *sync.WaitGroup) { 208 | params := &s3.ListObjectsV2Input{ 209 | Bucket: aws.String(bucket), 210 | Prefix: aws.String(prefix), 211 | Delimiter: aws.String("/"), 212 | } 213 | 214 | err := svc.ListObjectsV2Pages(params, 215 | func(page *s3.ListObjectsV2Output, lastPage bool) bool { 216 | mu.Lock() 217 | *s3Objects = append(*s3Objects, page.Contents...) 218 | mu.Unlock() 219 | 220 | for _, commonPrefix := range page.CommonPrefixes { 221 | wg.Add(1) 222 | go func(p string) { 223 | defer sem.Release(1) 224 | defer wg.Done() 225 | sem.Acquire(context.Background(), 1) // 要放go func里面,因为上级线程需要继续运行下去 226 | concurrencyListObjects(svc, bucket, p, sem, s3Objects, mu, wg) //每个Prefix递归并发新线程 227 | }(*commonPrefix.Prefix) 228 | } 229 | return !lastPage 230 | }) 231 | if err != nil { 232 | fmt.Printf("Error listing s3 objects: %v", err) 233 | } 234 | } 235 | 236 | func getMetadata(b BInfo, fileInfo *FileInfo) error { 237 | log.Printf("-->Get metadata s3://%s\n", path.Join(fileInfo.FromBucket, fileInfo.FromKey)) 238 | headResp, err := b.svc.HeadObject(&s3.HeadObjectInput{ 239 | Bucket: aws.String(fileInfo.FromBucket), 240 | Key: aws.String(fileInfo.FromKey), 241 | }) 242 | if err != nil { 243 | log.Printf("failed to get object metadata, %v", err) 244 | } 245 | 246 | fileInfo.Others = MetaStruct{ 247 | Metadata: headResp.Metadata, 248 | ContentType: headResp.ContentType, 249 | ContentLanguage: headResp.ContentLanguage, 250 | ContentEncoding: headResp.ContentEncoding, 251 | CacheControl: headResp.CacheControl, 252 | ContentDisposition: headResp.ContentDisposition, 253 | } 254 | return nil 255 | } 256 | -------------------------------------------------------------------------------- /download.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "io" 7 | "log" 8 | "os" 9 | "path" 10 | "path/filepath" 11 | "strings" 12 | "sync" 13 | "sync/atomic" 14 | 15 | "github.com/aws/aws-sdk-go/aws" 16 | "github.com/aws/aws-sdk-go/service/s3" 17 | "golang.org/x/sync/semaphore" 18 | ) 19 | 20 | func startDownload(from, to BInfo) error { 21 | var wg sync.WaitGroup 22 | var err error 23 | semFile := semaphore.NewWeighted(int64(cfg.NumWorkers)) 
// 并发量为NumWorkers的信号量 for file 24 | semPart := semaphore.NewWeighted(int64(cfg.NumWorkers * 4)) // 并发量为NumWorkers的信号量 for parts 25 | ignoreList := getIgnoreList() 26 | 27 | err = from.svc.ListObjectsV2Pages(&s3.ListObjectsV2Input{ 28 | Bucket: aws.String(from.bucket), 29 | Prefix: aws.String(from.prefix), 30 | }, func(page *s3.ListObjectsV2Output, lastPage bool) bool { 31 | for _, item := range page.Contents { 32 | // Skip if the object is a directory 33 | if strings.HasSuffix(*item.Key, "/") { 34 | log.Println("...Skiping directory", *item.Key) 35 | continue 36 | } 37 | // Skip if key in ignoreList 38 | if isIgnored(*item.Key, ignoreList) { 39 | log.Println("...Skiping ignored key in ignoreList", *item.Key) 40 | } 41 | 42 | var combinedKey string 43 | if *item.Key != from.prefix { 44 | // 只带上Prefix以内的目录结构 45 | combinedKey = strings.TrimPrefix(*item.Key, from.prefix) 46 | } else { 47 | // 单个文件的时候存在*item.Key == from.prefix的情况 48 | combinedKey = filepath.Base(*item.Key) 49 | } 50 | 51 | localPath := filepath.Join(to.url, combinedKey) 52 | // Check if file already exists and is the same size 53 | info, err := os.Stat(localPath) 54 | if !cfg.SkipCompare { 55 | if err == nil && info.Size() == *item.Size { 56 | log.Println("...File exists and same size, skipping", localPath) 57 | continue 58 | } else if err != nil && !os.IsNotExist(err) { 59 | log.Println("Failed to stat file", localPath, err) 60 | continue 61 | } 62 | } 63 | 64 | // Create necessary directories 65 | thisdir := filepath.Dir(localPath) 66 | if err := os.MkdirAll(thisdir, 0755); err != nil { 67 | log.Println("Failed to create directories:", localPath, err) 68 | continue 69 | } 70 | 71 | semFile.Acquire(context.Background(), 1) //从线程信号池中获取,没有线程可用了就阻塞等待 72 | atomic.AddInt32(&runningGoroutines, 1) //线程计数 73 | wg.Add(1) 74 | go func(item *s3.Object) { 75 | defer wg.Done() 76 | defer semFile.Release(1) 77 | defer atomic.AddInt32(&runningGoroutines, -1) 78 | 79 | // 小文件 80 | if *item.Size < cfg.ResumableThreshold { 81 | log.Println(" Start to download (=ResumableThreshold):", localPath, "runningGoroutines:", runningGoroutines) 106 | multipart_download_finished := false 107 | file, err := os.OpenFile(localPath+".s3tmp", os.O_CREATE|os.O_WRONLY, 0644) 108 | if err != nil { 109 | log.Println("Failed to create s3tmp file", localPath, err) 110 | return 111 | } 112 | defer func() { 113 | file.Close() // 确保在 file close之后再执行rename 114 | if multipart_download_finished { 115 | // 检查文件是否存在, 如果文件存在,重命名为 localPath 116 | if _, err := os.Stat(localPath + ".s3tmp"); err == nil { 117 | // 118 | if err := os.Rename(localPath+".s3tmp", localPath); err != nil { 119 | log.Println(err, localPath) 120 | } 121 | } else if !os.IsNotExist(err) { 122 | log.Println(err, localPath) 123 | } // 如果文件不存在,跳过 124 | } 125 | }() 126 | 127 | fileInfo := FileInfo{ 128 | FromKey: *item.Key, 129 | FromBucket: from.bucket, 130 | Size: *item.Size, 131 | File: file, 132 | } 133 | indexList, chunkSizeAuto := split(fileInfo, cfg.ChunkSize) 134 | partnumberList, _ := getDownloadedParts(fileInfo) 135 | if len(partnumberList) != 0 { 136 | log.Printf("Exist %d/%d parts on local path: %s, %v\n", len(partnumberList), len(indexList), localPath+".s3tmp", partnumberList) 137 | } 138 | var wg2 sync.WaitGroup 139 | for i, offset := range indexList { 140 | if !contains(partnumberList, i+1) { 141 | size := chunkSizeAuto 142 | if offset+chunkSizeAuto > fileInfo.Size { 143 | size = fileInfo.Size - offset 144 | } 145 | partInfo := PartInfo{ 146 | FromKey: fileInfo.FromKey, 147 | FromBucket: 
fileInfo.FromBucket, 148 | PartNumber: int64(i + 1), 149 | Size: size, 150 | Offset: offset, 151 | TotalParts: int64(len(indexList)), 152 | } 153 | 154 | semPart.Acquire(context.Background(), 1) //从线程池中获取,没有线程可用了就阻塞等待 155 | atomic.AddInt32(&runningGoroutines, 1) //线程计数 156 | wg2.Add(1) 157 | go downloadPart(from.svc, partInfo, fileInfo.File, &wg2, semPart) 158 | } 159 | } 160 | // Clean up download part records, statstic counts 161 | wg2.Wait() 162 | deleteDownloadParts(fileInfo) 163 | multipart_download_finished = true 164 | } 165 | log.Println("***Successfully downloaded:", localPath) 166 | atomic.AddInt64(&objectCount, 1) 167 | atomic.AddInt64(&sizeCount, *item.Size) 168 | }(item) 169 | } 170 | return true 171 | }) 172 | if err != nil { 173 | log.Println("Failed to list objects", err) 174 | return err 175 | } 176 | wg.Wait() 177 | return err 178 | } 179 | 180 | func downloadPartAction(svc *s3.S3, partInfo PartInfo) ([]byte, error) { 181 | log.Printf("-->Downloading part s3://%s %d/%d, runningGoroutines: %d\n", path.Join(partInfo.FromBucket, partInfo.FromKey), partInfo.PartNumber, partInfo.TotalParts, runningGoroutines) 182 | input := &s3.GetObjectInput{ 183 | Bucket: &partInfo.FromBucket, 184 | Key: &partInfo.FromKey, 185 | Range: aws.String(fmt.Sprintf("bytes=%d-%d", partInfo.Offset, partInfo.Offset+partInfo.Size-1)), 186 | } 187 | if from.requestPayer { 188 | input.RequestPayer = aws.String("requester") 189 | } 190 | resp, err := svc.GetObject(input) 191 | if err != nil { 192 | log.Println("Failed to download part", partInfo.FromBucket, partInfo.FromKey, partInfo.PartNumber, err) 193 | return nil, err 194 | } 195 | defer resp.Body.Close() 196 | buffer, err := io.ReadAll(resp.Body) 197 | if err != nil { 198 | log.Println("Failed to read from response body:", partInfo.FromBucket, partInfo.FromKey, partInfo.PartNumber, err) 199 | return nil, err 200 | } 201 | return buffer, nil 202 | } 203 | 204 | func downloadPart(svc *s3.S3, partInfo PartInfo, file *os.File, wg *sync.WaitGroup, semPart *semaphore.Weighted) error { 205 | defer wg.Done() 206 | defer semPart.Release(1) 207 | defer atomic.AddInt32(&runningGoroutines, -1) 208 | 209 | // Download part S3 API Call 210 | buffer, err := downloadPartAction(svc, partInfo) 211 | if err != nil { 212 | return err 213 | } 214 | // Write the part to file 215 | if _, err := file.WriteAt(buffer, partInfo.Offset); err != nil { 216 | log.Println("Failed to write to file", partInfo.FromBucket, partInfo.FromKey, partInfo.PartNumber, err) 217 | return err 218 | } 219 | 220 | // Record the download part 221 | recordDownloadPart(partInfo) 222 | log.Printf("===Downloaded part s3://%s part:%d/%d\n", path.Join(partInfo.FromBucket, partInfo.FromKey), partInfo.PartNumber, partInfo.TotalParts) 223 | return nil 224 | } 225 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module s3trans 2 | 3 | go 1.22.7 4 | 5 | require ( 6 | github.com/aws/aws-sdk-go v1.55.5 7 | github.com/google/uuid v1.6.0 8 | github.com/mattn/go-sqlite3 v1.14.24 9 | github.com/spf13/cobra v1.8.1 10 | github.com/spf13/viper v1.19.0 11 | golang.org/x/sync v0.10.0 12 | ) 13 | 14 | require ( 15 | github.com/fsnotify/fsnotify v1.7.0 // indirect 16 | github.com/hashicorp/hcl v1.0.0 // indirect 17 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 18 | github.com/jmespath/go-jmespath v0.4.0 // indirect 19 | github.com/magiconair/properties v1.8.7 // indirect 20 | 
github.com/mitchellh/mapstructure v1.5.0 // indirect 21 | github.com/pelletier/go-toml/v2 v2.2.2 // indirect 22 | github.com/sagikazarmark/locafero v0.4.0 // indirect 23 | github.com/sagikazarmark/slog-shim v0.1.0 // indirect 24 | github.com/sourcegraph/conc v0.3.0 // indirect 25 | github.com/spf13/afero v1.11.0 // indirect 26 | github.com/spf13/cast v1.6.0 // indirect 27 | github.com/spf13/pflag v1.0.5 // indirect 28 | github.com/subosito/gotenv v1.6.0 // indirect 29 | go.uber.org/atomic v1.9.0 // indirect 30 | go.uber.org/multierr v1.9.0 // indirect 31 | golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect 32 | golang.org/x/sys v0.18.0 // indirect 33 | golang.org/x/text v0.14.0 // indirect 34 | gopkg.in/ini.v1 v1.67.0 // indirect 35 | gopkg.in/yaml.v3 v3.0.1 // indirect 36 | ) 37 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU= 2 | github.com/aws/aws-sdk-go v1.55.5/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU= 3 | github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= 4 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 5 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 6 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= 7 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 8 | github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= 9 | github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= 10 | github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= 11 | github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= 12 | github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= 13 | github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 14 | github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= 15 | github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= 16 | github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= 17 | github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= 18 | github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= 19 | github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= 20 | github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= 21 | github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= 22 | github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= 23 | github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= 24 | github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= 25 | github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= 26 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= 27 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= 28 | 
github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= 29 | github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= 30 | github.com/mattn/go-sqlite3 v1.14.24 h1:tpSp2G2KyMnnQu99ngJ47EIkWVmliIizyZBfPrBWDRM= 31 | github.com/mattn/go-sqlite3 v1.14.24/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= 32 | github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= 33 | github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= 34 | github.com/pelletier/go-toml/v2 v2.2.2 h1:aYUidT7k73Pcl9nb2gScu7NSrKCSHIDE89b3+6Wq+LM= 35 | github.com/pelletier/go-toml/v2 v2.2.2/go.mod h1:1t835xjRzz80PqgE6HHgN2JOsmgYu/h4qDAS4n929Rs= 36 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 37 | github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= 38 | github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 39 | github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= 40 | github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= 41 | github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 42 | github.com/sagikazarmark/locafero v0.4.0 h1:HApY1R9zGo4DBgr7dqsTH/JJxLTTsOt7u6keLGt6kNQ= 43 | github.com/sagikazarmark/locafero v0.4.0/go.mod h1:Pe1W6UlPYUk/+wc/6KFhbORCfqzgYEpgQ3O5fPuL3H4= 44 | github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6gto+ugjYE= 45 | github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWRIczQRv+GVI1AkeQ= 46 | github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo= 47 | github.com/sourcegraph/conc v0.3.0/go.mod h1:Sdozi7LEKbFPqYX2/J+iBAM6HpqSLTASQIKqDmF7Mt0= 48 | github.com/spf13/afero v1.11.0 h1:WJQKhtpdm3v2IzqG8VMqrr6Rf3UYpEF239Jy9wNepM8= 49 | github.com/spf13/afero v1.11.0/go.mod h1:GH9Y3pIexgf1MTIWtNGyogA5MwRIDXGUr+hbWNoBjkY= 50 | github.com/spf13/cast v1.6.0 h1:GEiTHELF+vaR5dhz3VqZfFSzZjYbgeKDpBxQVS4GYJ0= 51 | github.com/spf13/cast v1.6.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= 52 | github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= 53 | github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= 54 | github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= 55 | github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= 56 | github.com/spf13/viper v1.19.0 h1:RWq5SEjt8o25SROyN3z2OrDB9l7RPd3lwTWU8EcEdcI= 57 | github.com/spf13/viper v1.19.0/go.mod h1:GQUN9bilAbhU/jgc1bKs99f/suXKeUMct8Adx5+Ntkg= 58 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 59 | github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= 60 | github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= 61 | github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= 62 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 63 | github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 64 | github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= 65 | github.com/stretchr/testify v1.8.4/go.mod 
h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= 66 | github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= 67 | github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 68 | github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= 69 | github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= 70 | go.uber.org/atomic v1.9.0 h1:ECmE8Bn/WFTYwEW/bpKD3M8VtR/zQVbavAoalC1PYyE= 71 | go.uber.org/atomic v1.9.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= 72 | go.uber.org/multierr v1.9.0 h1:7fIwc/ZtS0q++VgcfqFDxSBZVv/Xo49/SYnDFupUwlI= 73 | go.uber.org/multierr v1.9.0/go.mod h1:X2jQV1h+kxSjClGpnseKVIxpmcjrj7MNnI0bnlfKTVQ= 74 | golang.org/x/exp v0.0.0-20230905200255-921286631fa9 h1:GoHiUyI/Tp2nVkLI2mCxVkOjsbSXD66ic0XW0js0R9g= 75 | golang.org/x/exp v0.0.0-20230905200255-921286631fa9/go.mod h1:S2oDrQGGwySpoQPVqRShND87VCbxmc6bL1Yd2oYrm6k= 76 | golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= 77 | golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 78 | golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4= 79 | golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 80 | golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= 81 | golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= 82 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 83 | gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= 84 | gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 85 | gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= 86 | gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= 87 | gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10= 88 | gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 89 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 90 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 91 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 92 | -------------------------------------------------------------------------------- /http_download.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "context" 6 | "fmt" 7 | "io" 8 | "log" 9 | "net/http" 10 | "net/url" 11 | "os" 12 | "path" 13 | "path/filepath" 14 | "strconv" 15 | "strings" 16 | "sync" 17 | "sync/atomic" 18 | "time" 19 | 20 | "golang.org/x/sync/semaphore" 21 | ) 22 | 23 | func startHttpDownload(from, to BInfo) error { 24 | var wg sync.WaitGroup 25 | semFile := semaphore.NewWeighted(int64(cfg.NumWorkers)) // 并发量为NumWorkers的信号量 for file 26 | var httpList []string 27 | 28 | switch cfg.WorkMode { 29 | case "HTTP_DOWNLOAD": 30 | httpList = append(httpList, from.url) 31 | case "HTTP_DOWNLOAD_LIST": 32 | // Read localfile of presign url lines as list from from.url 33 | file, err := os.Open(from.url) 34 | if err != nil { 35 | log.Println("Failed to open file of HTTP_DOWNLOAD_LIST", err) 36 | return err 37 | } 38 | defer file.Close() 39 | 40 | scanner := bufio.NewScanner(file) 41 | for scanner.Scan() { 42 | httpList = append(httpList, scanner.Text()) 43 | } 44 
| log.Println("Read local file of HTTP_DOWNLOAD_LIST, total:", len(httpList)) 45 | } 46 | 47 | for _, thisUrl := range httpList { 48 | semFile.Acquire(context.Background(), 1) //从线程信号池中获取,没有线程可用了就阻塞等待 49 | atomic.AddInt32(&runningGoroutines, 1) //线程计数 50 | wg.Add(1) 51 | go downloadHTTPFile(thisUrl, &wg, semFile) 52 | } 53 | wg.Wait() 54 | return nil 55 | } 56 | 57 | func downloadHTTPFile(thisUrl string, wg *sync.WaitGroup, semFile *semaphore.Weighted) error { 58 | defer wg.Done() 59 | defer semFile.Release(1) 60 | defer atomic.AddInt32(&runningGoroutines, -1) 61 | 62 | // Download each object 63 | URL, err := url.Parse(thisUrl) 64 | if err != nil { 65 | log.Printf("Invalid HTTP URL: %s, %v\n", thisUrl, err) 66 | return err 67 | } 68 | from.bucket = strings.Split(URL.Host, ".")[0] 69 | fullPrefix := strings.TrimSuffix(strings.TrimPrefix(URL.Path, "/"), "/") 70 | fileName := filepath.Base(fullPrefix) 71 | localPath := filepath.Join(to.url, fileName) 72 | 73 | // Get the file size 74 | fileSize, err := getHTTPFileSize(thisUrl) 75 | if err != nil { 76 | log.Println("Failed to get file size:", thisUrl, err) 77 | return err 78 | } 79 | 80 | // Check if file already exists and is the same size 81 | info, err := os.Stat(localPath) 82 | if !cfg.SkipCompare { 83 | if err == nil && info.Size() == fileSize { 84 | log.Println("...File exists and same size, skipping", localPath) 85 | return nil 86 | } else if err != nil && !os.IsNotExist(err) { 87 | log.Println("Failed to stat file", localPath, err) 88 | return err 89 | } 90 | } 91 | 92 | log.Println(" Start to https download:", localPath) 93 | multipart_download_finished := false 94 | 95 | // Create necessary directories 96 | thisdir := filepath.Dir(localPath) 97 | if err := os.MkdirAll(thisdir, 0755); err != nil { 98 | log.Println("Failed to create directories:", localPath, err) 99 | return err 100 | } 101 | 102 | file, err := os.OpenFile(localPath+".s3tmp", os.O_CREATE|os.O_WRONLY, 0644) 103 | if err != nil { 104 | log.Println("Failed to create s3tmp file:", localPath, err) 105 | return err 106 | } 107 | defer func() { 108 | file.Close() // 确保在 file close之后再执行rename 109 | if multipart_download_finished { 110 | // 检查文件是否存在, 如果文件存在,重命名为 localPath 111 | if _, err := os.Stat(localPath + ".s3tmp"); err == nil { 112 | // 113 | if err := os.Rename(localPath+".s3tmp", localPath); err != nil { 114 | log.Println(err, localPath) 115 | } 116 | } else if !os.IsNotExist(err) { 117 | log.Println(err, localPath) 118 | } // 如果文件不存在,跳过 119 | } 120 | }() 121 | 122 | // list parts numbers 123 | fileInfo := FileInfo{ 124 | FromKey: fullPrefix, 125 | FromBucket: from.bucket, 126 | Size: fileSize, 127 | File: file, 128 | } 129 | indexList, chunkSizeAuto := split(fileInfo, cfg.ChunkSize) 130 | partnumberList, _ := getDownloadedParts(fileInfo) 131 | if len(partnumberList) != 0 { 132 | log.Printf("Exist %d/%d parts on local path: %s, %v\n", len(partnumberList), len(indexList), localPath+".s3tmp", partnumberList) 133 | } 134 | 135 | // Follow indexList to download parts 136 | var wg2 sync.WaitGroup 137 | semPart := semaphore.NewWeighted(int64(cfg.NumWorkers * 4)) // 并发量为NumWorkers的信号量 for parts 138 | 139 | for i, offset := range indexList { 140 | if !contains(partnumberList, i+1) { 141 | size := chunkSizeAuto 142 | if offset+chunkSizeAuto > fileInfo.Size { 143 | size = fileInfo.Size - offset 144 | } 145 | partInfo := PartInfo{ 146 | FromBucket: from.bucket, 147 | FromKey: fullPrefix, 148 | URL: thisUrl, 149 | PartNumber: int64(i + 1), 150 | Size: size, 151 | Offset: offset, 
152 | TotalParts: int64(len(indexList)), 153 | } 154 | 155 | semPart.Acquire(context.Background(), 1) //从线程池中获取,没有线程可用了就阻塞等待 156 | atomic.AddInt32(&runningGoroutines, 1) //线程计数 157 | wg2.Add(1) 158 | go downloadHttpChunk(partInfo, fileInfo.File, &wg2, semPart) 159 | } 160 | } 161 | // Clean up download part records, statstic counts 162 | wg2.Wait() 163 | deleteDownloadParts(fileInfo) 164 | multipart_download_finished = true 165 | log.Println(" Finish https download:", localPath) 166 | atomic.AddInt64(&objectCount, 1) 167 | atomic.AddInt64(&sizeCount, fileSize) 168 | return nil 169 | } 170 | 171 | func downloadHttpChunk(partInfo PartInfo, file *os.File, wg *sync.WaitGroup, semPart *semaphore.Weighted) error { 172 | defer wg.Done() 173 | defer semPart.Release(1) 174 | defer atomic.AddInt32(&runningGoroutines, -1) 175 | 176 | // Download part HTTP API Call 177 | buffer, err := getHTTPFileBody(partInfo) 178 | if err != nil { 179 | return err 180 | } 181 | // Write the part to file 182 | if _, err := file.WriteAt(buffer, partInfo.Offset); err != nil { 183 | log.Printf("Failed to write part s3://%s part:%d/%d, err: %v\n", path.Join(partInfo.FromBucket, partInfo.FromKey), partInfo.PartNumber, partInfo.TotalParts, err) 184 | return err 185 | } 186 | 187 | // Record the download part 188 | recordDownloadPart(partInfo) 189 | log.Printf("===Downloaded part s3://%s part:%d/%d\n", path.Join(partInfo.FromBucket, partInfo.FromKey), partInfo.PartNumber, partInfo.TotalParts) 190 | return nil 191 | 192 | } 193 | 194 | func getHTTPFileBody(partInfo PartInfo) ([]byte, error) { 195 | log.Printf("-->Downloading part s3://%s %d/%d, runningGoroutines: %d\n", path.Join(partInfo.FromBucket, partInfo.FromKey), partInfo.PartNumber, partInfo.TotalParts, runningGoroutines) 196 | 197 | req, err := http.NewRequest("GET", partInfo.URL, nil) 198 | if err != nil { 199 | fmt.Println("Error creating request:", err) 200 | return nil, err 201 | } 202 | 203 | req.Header.Set("Range", fmt.Sprintf("bytes=%d-%d", partInfo.Offset, partInfo.Offset+partInfo.Size-1)) 204 | 205 | retryRoundTripper := &RetryRoundTripper{ 206 | Proxied: http.DefaultTransport, 207 | Retries: 3, // Set the desired number of retries 208 | Delay: time.Second * 5, // Set the desired delay between retries 209 | } 210 | client := &http.Client{ 211 | Transport: retryRoundTripper, 212 | } 213 | 214 | resp, err := client.Do(req) 215 | if err != nil { 216 | fmt.Println("Error downloading chunk:", err) 217 | return nil, err 218 | } 219 | defer resp.Body.Close() 220 | 221 | buffer := make([]byte, partInfo.Size) 222 | _, err = io.ReadFull(resp.Body, buffer) 223 | if err != nil { 224 | fmt.Println("Error reading chunk:", err) 225 | return nil, err 226 | } 227 | 228 | return buffer, nil 229 | } 230 | 231 | func getHTTPFileSize(thisUrl string) (int64, error) { 232 | req, err := http.NewRequest("GET", thisUrl, nil) 233 | if err != nil { 234 | log.Println("Failed to create request for:", thisUrl, err) 235 | return 0, err 236 | } 237 | req.Header.Set("Range", "bytes=0-0") 238 | 239 | retryRoundTripper := &RetryRoundTripper{ 240 | Proxied: http.DefaultTransport, 241 | Retries: 3, // Set the desired number of retries 242 | Delay: time.Second * 5, // Set the desired delay between retries 243 | } 244 | client := &http.Client{ 245 | Transport: retryRoundTripper, 246 | } 247 | 248 | resp, err := client.Do(req) 249 | if err != nil { 250 | log.Println("Failed to GET file size for:", thisUrl, err) 251 | return 0, err 252 | } 253 | defer resp.Body.Close() 254 | if resp.StatusCode != 
http.StatusPartialContent { 255 | log.Println("unexpected status code while GET file size:", resp.StatusCode, thisUrl) 256 | return 0, fmt.Errorf("unexpected status code %d while GET file size: %s", resp.StatusCode, thisUrl) 257 | } 258 | fileSizeStr := resp.Header.Get("Content-Range") 259 | if fileSizeStr == "" { 260 | log.Println("missing Content-Range header while GET file size for:", thisUrl) 261 | return 0, fmt.Errorf("missing Content-Range header while GET file size for: %s", thisUrl) 262 | } 263 | parts := strings.Split(fileSizeStr, "/") 264 | if len(parts) != 2 { 265 | log.Println("invalid Content-Range header format while GET file size for:", thisUrl) 266 | return 0, fmt.Errorf("invalid Content-Range header format while GET file size for: %s", thisUrl) 267 | } 268 | 269 | fileSize, err := strconv.ParseInt(parts[1], 10, 64) 270 | if err != nil { 271 | log.Println("Failed to parse file size for:", thisUrl, err) 272 | return 0, err 273 | } 274 | return fileSize, nil 275 | } 276 | -------------------------------------------------------------------------------- /img/arch-cn1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-s3-resumable-upload/ccc06cc59b4bac7aa092450099c5bbc29f022914/img/arch-cn1.png -------------------------------------------------------------------------------- /img/arch-cn2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-s3-resumable-upload/ccc06cc59b4bac7aa092450099c5bbc29f022914/img/arch-cn2.png -------------------------------------------------------------------------------- /img/arch-en1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-s3-resumable-upload/ccc06cc59b4bac7aa092450099c5bbc29f022914/img/arch-en1.png -------------------------------------------------------------------------------- /img/arch-en2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-s3-resumable-upload/ccc06cc59b4bac7aa092450099c5bbc29f022914/img/arch-en2.png -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | // 多线程并发断点续传上传/下载S3,支持Amazon S3, Ali OSS, Tencent COS, Google GCS 等兼容S3 API的对象存储 2 | // 使用 ./s3trans -h 获取更多帮助信息 3 | package main 4 | 5 | import ( 6 | "fmt" 7 | "log" 8 | "net/http" 9 | "net/url" 10 | "os" 11 | "strings" 12 | "time" 13 | 14 | "github.com/aws/aws-sdk-go/aws" 15 | "github.com/aws/aws-sdk-go/aws/credentials" 16 | "github.com/aws/aws-sdk-go/aws/session" 17 | "github.com/aws/aws-sdk-go/service/s3" 18 | "github.com/aws/aws-sdk-go/service/s3/s3manager" 19 | "github.com/aws/aws-sdk-go/service/sqs" 20 | _ "github.com/mattn/go-sqlite3" // 导入SQLite3包但不使用,只用其驱动 21 | "github.com/spf13/cobra" 22 | "github.com/spf13/viper" 23 | ) 24 | 25 | type Config struct { 26 | ListTarget bool `mapstructure:"list-target"` // 一次性从目标S3获取列表进行对比再开始传输,文件数量大的情况可以节省每次请求之前逐个文件对比的API Call 27 | SkipCompare bool `mapstructure:"skip-compare"` // 是否不做目标S3与源文件的对比,即无论是否有重复文件,都直接开始传输并覆盖 28 | TransferMetadata bool `mapstructure:"transfer-metadata"` // 是否传输源S3 Object MetaData到目标S3,只在S3toS3模式下可用 29 | HttpTimeout int `mapstructure:"http-timeout"` // S3 http 超时时间(秒) 30 | MaxRetries int `mapstructure:"max-retries"` // API 请求最大重试次数 31 | ResumableThreshold int64 `mapstructure:"resumable-threshold"` // 走断点续传流程的门槛,小于该值则直接并发下载,对于文件不大或不担心中断的情况效率更高(单位MB) 32 | NumWorkers int `mapstructure:"num-workers"` // 控制 goroutine 总量 33 | WorkMode string `mapstructure:"work-mode"` // SQS_SEND | 
SQS_CONSUME 34 | SQSUrl string `mapstructure:"sqs-url"` // SQS Queue URL 35 | SQSProfile string `mapstructure:"sqs-profile"` // SQS Queue Profile 36 | YPtr bool `mapstructure:"y"` // Ignore waiting for confirming command 37 | DBPath string `mapstructure:"db-location"` // 自动创建已经下载的分片状态记录数据库 38 | ChunkSize int64 `mapstructure:"chunk-size"` // Multipart 分片大小 39 | RetryDelay int `mapstructure:"retry-delay"` // API 请求重试延迟时间(秒) 40 | JobListPath string `mapstructure:"joblist-write-to-filepath"` // 列出S3传输任务之后,写入到一个文件作为备份 41 | SQSSentLogName string `mapstructure:"sqs-log-to-filename"` // SQS已发送消息的记录文件名 42 | IgnoreListPath string `mapstructure:"ignore-list-path"` // List和传输的时候,如果S3源的Key或本地源路径的前缀在Ignore List里面,则跳过。设置的时候注意S3的Key是不带“/”开头的 43 | ForcePathStyle bool `mapstructure:"force-path-style"` // 强制使用路径方式访问S3,而不是域名方式 44 | } 45 | 46 | type BInfo struct { 47 | url, bucket, prefix, profile, endpoint, region, storageClass, ACL string 48 | noSignRequest bool // The bucket is noSignRequest, no need to sign 49 | requestPayer bool // The bucket is requestPayer 50 | sess *session.Session 51 | svc *s3.S3 52 | downloader *s3manager.Downloader 53 | uploader *s3manager.Uploader 54 | } 55 | 56 | type MetaStruct struct { 57 | Metadata map[string]*string 58 | ContentType, ContentLanguage, ContentEncoding, CacheControl, ContentDisposition *string 59 | Expires *time.Time 60 | } 61 | 62 | type FileInfo struct { 63 | FromKey, FromBucket, ToKey, ToBucket string 64 | Size int64 65 | File *os.File 66 | Others MetaStruct 67 | } 68 | 69 | type PartInfo struct { 70 | FromKey, FromBucket, ToKey, ToBucket, Etag string 71 | Size, Offset int64 72 | PartNumber, TotalParts int64 73 | URL string 74 | } 75 | 76 | type RetryFunc func() error 77 | 78 | var ( 79 | from, to BInfo 80 | objectCount, sizeCount int64 81 | runningGoroutines int32 // 当前正在运行的 goroutine 的数量 82 | cfg Config 83 | sqsSvc *sqs.SQS 84 | ) 85 | 86 | var rootCmd = &cobra.Command{ 87 | Use: "s3trans FROM_URL TO_URL", 88 | Short: "s3trans transfers data from source to target", 89 | Long: `s3trans transfers data from source to target. 90 | ./s3trans FROM_URL TO_URL [OPTIONS] 91 | FROM_URL: The url of data source, e.g. /home/user/data or s3://bucket/prefix 92 | TO_URL: The url of data transfer target, e.g. 
/home/user/data or s3://bucket/prefix 93 | For example: 94 | ./s3trans s3://bucket/prefix s3://bucket/prefix -from_profile sin -to_profile bjs 95 | ./s3trans s3://bucket/prefix /home/user/data -from_profile sin 96 | `, 97 | Args: cobra.ExactArgs(2), // 要求必须提供2个参数 98 | Run: func(cmd *cobra.Command, args []string) { 99 | // args[0] 是 FROM_URL, args[1] 是 TO_URL 100 | from.url = args[0] 101 | to.url = args[1] 102 | }, 103 | } 104 | 105 | func init() { 106 | rootCmd.SetHelpFunc(func(cmd *cobra.Command, args []string) { 107 | fmt.Print(cmd.Long) 108 | os.Exit(0) 109 | }) 110 | cobra.OnInitialize(initConfig) 111 | rootCmd.PersistentFlags().String("from-profile", "", "The AWS profile in ~/.aws/credentials of data source") 112 | viper.BindPFlag("from-profile", rootCmd.PersistentFlags().Lookup("from-profile")) 113 | rootCmd.PersistentFlags().Bool("force-path-style", false, "Set this to `true` to force the request to use path-style addressing See http://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html") 114 | viper.BindPFlag("force-path-style", rootCmd.PersistentFlags().Lookup("force-path-style")) 115 | rootCmd.PersistentFlags().String("to-profile", "", "The AWS profile in ~/.aws/credentials of data transfer target") 116 | viper.BindPFlag("to-profile", rootCmd.PersistentFlags().Lookup("to-profile")) 117 | rootCmd.PersistentFlags().String("from-endpoint", "", "The endpoint of data source, e.g. https://storage.googleapis.com; https://oss-.aliyuncs.com; https://cos..myqcloud.com . If AWS s3 or local path, no need to specify this.") 118 | viper.BindPFlag("from-endpoint", rootCmd.PersistentFlags().Lookup("from-endpoint")) 119 | rootCmd.PersistentFlags().String("to-endpoint", "", "The endpoint of data transfer target, e.g. https://storage.googleapis.com . If AWS s3 or local path, no need to specify this.") 120 | viper.BindPFlag("to-endpoint", rootCmd.PersistentFlags().Lookup("to-endpoint")) 121 | rootCmd.PersistentFlags().String("from-region", "", "The region of data transfer source, e.g. cn-north-1. If no specified, the region will be auto detected with the credentials you provided in profile.") 122 | viper.BindPFlag("from-region", rootCmd.PersistentFlags().Lookup("from-region")) 123 | rootCmd.PersistentFlags().String("to-region", "", "The region of data transfer target, e.g. us-east-1. If no specified, the region will be auto detected with the credentials you provided in profile.") 124 | viper.BindPFlag("to-region", rootCmd.PersistentFlags().Lookup("to-region")) 125 | rootCmd.PersistentFlags().String("storage-class", "", "The TARGET S3 bucket storage class, e.g. STANDARD|REDUCED_REDUNDANCY|STANDARD_IA|ONEZONE_IA|INTELLIGENT_TIERING|GLACIER|DEEP_ARCHIVE|OUTPOSTS|GLACIER_IR|SNOW or others of S3 compatibale") 126 | viper.BindPFlag("storage-class", rootCmd.PersistentFlags().Lookup("storage-class")) 127 | rootCmd.PersistentFlags().String("acl", "", "The TARGET S3 bucket ACL, private means only the object owner can read&write, e.g. 
private|public-read|public-read-write|authenticated-read|aws-exec-read|bucket-owner-read|bucket-owner-full-control") 128 | viper.BindPFlag("acl", rootCmd.PersistentFlags().Lookup("acl")) 129 | rootCmd.PersistentFlags().Bool("no-sign-request", false, "The SOURCE bucket is not needed to sign the request") 130 | viper.BindPFlag("no-sign-request", rootCmd.PersistentFlags().Lookup("no-sign-request")) 131 | rootCmd.PersistentFlags().Bool("request-payer", false, "The SOURCE bucket requires requester to pay, set this") 132 | viper.BindPFlag("request-payer", rootCmd.PersistentFlags().Lookup("request-payer")) 133 | rootCmd.PersistentFlags().String("db-location", "./download-status.db", "local db to record download resumable status") 134 | viper.BindPFlag("db-location", rootCmd.PersistentFlags().Lookup("db-location")) 135 | 136 | rootCmd.PersistentFlags().BoolP("list-target", "l", false, "List the TARGET S3 bucket, compare exist objects BEFORE transfer. List is more efficient than head each object to check if it exists, but transfer may start slower because it needs to wait for listing all objects to compare. To mitigate this, this app leverage Concurrency Listing for fast list; If no list-target para, transfer without listing the target S3 bucket, but before transfering each object, head each target object to check, this costs more API call, but start faster.") 137 | viper.BindPFlag("list-target", rootCmd.PersistentFlags().Lookup("list-target")) 138 | rootCmd.PersistentFlags().BoolP("skip-compare", "s", false, "If True, skip to compare the name and size between source and target S3 object. Just overwrite all objects. No list target nor head target object to check if it already exists.") 139 | viper.BindPFlag("skip-compare", rootCmd.PersistentFlags().Lookup("skip-compare")) 140 | rootCmd.PersistentFlags().Bool("transfer-metadata", false, "If True, get metadata from source S3 bucket and upload the metadata to target object. 
This costs more API calls.") 141 | viper.BindPFlag("transfer-metadata", rootCmd.PersistentFlags().Lookup("transfer-metadata")) 142 | 143 | rootCmd.PersistentFlags().Int("http-timeout", 30, "API request timeout (seconds)") 144 | viper.BindPFlag("http-timeout", rootCmd.PersistentFlags().Lookup("http-timeout")) 145 | rootCmd.PersistentFlags().Int("max-retries", 5, "API request max retries") 146 | viper.BindPFlag("max-retries", rootCmd.PersistentFlags().Lookup("max-retries")) 147 | rootCmd.PersistentFlags().Int("retry-delay", 5, "Delay before next retry in secondes") 148 | viper.BindPFlag("retry-delay", rootCmd.PersistentFlags().Lookup("retry-delay")) 149 | rootCmd.PersistentFlags().Int64("chunk-size", 5, "Multipart part size(MB)") 150 | viper.BindPFlag("chunk-size", rootCmd.PersistentFlags().Lookup("chunk-size")) 151 | rootCmd.PersistentFlags().Int64("resumable-threshold", 50, "When the file size (MB) is larger than this value, the file will be resumable transfered.") 152 | viper.BindPFlag("resumable-threshold", rootCmd.PersistentFlags().Lookup("resumable-threshold")) 153 | rootCmd.PersistentFlags().IntP("num-workers", "n", 4, "Max concurrent threads = NumWorkers*NumWorkers*4 (files*parts*4), recommend NumWorkers <= vCPU number") 154 | viper.BindPFlag("num-workers", rootCmd.PersistentFlags().Lookup("num-workers")) 155 | rootCmd.PersistentFlags().BoolP("y", "y", false, "Ignore waiting for confirming command") 156 | viper.BindPFlag("y", rootCmd.PersistentFlags().Lookup("y")) 157 | 158 | rootCmd.PersistentFlags().String("work-mode", "", "SQS_SEND | SQS_CONSUME | DRYRUN | HTTP_DOWNLOAD_LIST; SQS_SEND means listing source FROM_URL S3 and target TO_URL S3 to compare and send message to SQS queue, SQS_CONSUME means consume message from SQS queue and transfer objects from FROM_URL S3 to TO_URL S3; DRYRUN means only count the objects and sizes comparing delta list of FROM_URL S3 and TO_URL S3, no transfer; HTTP_DOWNLOAD_LIST, from a list file with lines of presign url;") 159 | viper.BindPFlag("work-mode", rootCmd.PersistentFlags().Lookup("work-mode")) 160 | rootCmd.PersistentFlags().String("sqs-url", "", "The SQS queue URL to send or consume message from, e.g. https://sqs.us-east-1.amazonaws.com/my_account/my_queue_name") 161 | viper.BindPFlag("sqs-url", rootCmd.PersistentFlags().Lookup("sqs-url")) 162 | rootCmd.PersistentFlags().String("sqs-profile", "", "The SQS queue leverage which AWS profile in ~/.aws/credentials") 163 | viper.BindPFlag("sqs-profile", rootCmd.PersistentFlags().Lookup("sqs-profile")) 164 | rootCmd.PersistentFlags().String("joblist-write-to-filepath", "", "After listing source and target S3, compare the delta joblist and write the joblist to this filepath, e.g. ./joblist.txt") 165 | viper.BindPFlag("joblist-write-to-filepath", rootCmd.PersistentFlags().Lookup("joblist-write-to-filepath")) 166 | rootCmd.PersistentFlags().String("sqs-log-to-filename", "", "After sent joblist to SQS, write the sent messages log to this filepath, e.g. ./sqs-log.txt") 167 | viper.BindPFlag("sqs-log-to-filename", rootCmd.PersistentFlags().Lookup("sqs-log-to-filename")) 168 | rootCmd.PersistentFlags().String("ignore-list-path", "", "When listing and transfer, if source S3 key or local path matching the prefix in this ignore-list, it will be ignored. This is useful to ignore some objects that are not needed to transfer. The ignore-list is a file path, e.g. 
./ignore-list.txt") 169 | viper.BindPFlag("ignore-list-path", rootCmd.PersistentFlags().Lookup("ignore-list-path")) 170 | } 171 | 172 | func initConfig() { 173 | viper.AutomaticEnv() // read in environment variables that match 174 | 175 | // If a config file is found, read it in. 176 | viper.SetConfigFile("config.yaml") // YAML 格式配置文件 config.yaml 177 | if err := viper.ReadInConfig(); err == nil { 178 | fmt.Println("Using config file:", viper.ConfigFileUsed()) 179 | } 180 | // Unmarshal config into cfg struct 181 | if err := viper.Unmarshal(&cfg); err != nil { 182 | fmt.Println("Error unmarshalling config:", err) 183 | os.Exit(1) 184 | } 185 | } 186 | 187 | func getConfig() { 188 | if err := rootCmd.Execute(); err != nil { 189 | fmt.Println(err) 190 | os.Exit(1) 191 | } 192 | from.profile = viper.GetString("from-profile") 193 | to.profile = viper.GetString("to-profile") 194 | from.endpoint = viper.GetString("from-endpoint") 195 | to.endpoint = viper.GetString("to-endpoint") 196 | from.region = viper.GetString("from-region") 197 | to.region = viper.GetString("to-region") 198 | to.storageClass = viper.GetString("storage-class") 199 | to.ACL = viper.GetString("acl") 200 | from.noSignRequest = viper.GetBool("no-sign-request") 201 | from.requestPayer = viper.GetBool("request-payer") 202 | cfg.ResumableThreshold = cfg.ResumableThreshold * 1024 * 1024 203 | cfg.ChunkSize = cfg.ChunkSize * 1024 * 1024 204 | 205 | for i, binfo := range []*BInfo{&from, &to} { 206 | if i == 0 { 207 | fmt.Print("From ") 208 | } else { 209 | fmt.Print("To ") 210 | } 211 | if strings.HasPrefix(binfo.url, "s3://") { 212 | // Parse S3 URL 213 | URL, err := url.Parse(binfo.url) 214 | if err != nil { 215 | log.Fatalf("Invalid S3 URL: %s, %v\n", binfo.url, err) 216 | os.Exit(1) 217 | } 218 | binfo.bucket = URL.Host 219 | binfo.prefix = strings.TrimSuffix(strings.TrimPrefix(URL.Path, "/"), "/") 220 | binfo.sess = getSess(binfo) 221 | binfo.svc = s3.New(binfo.sess) 222 | if i == 0 { 223 | binfo.downloader = s3manager.NewDownloader(binfo.sess) 224 | binfo.downloader.Concurrency = cfg.NumWorkers * 4 225 | binfo.downloader.PartSize = cfg.ChunkSize 226 | } else { 227 | binfo.uploader = s3manager.NewUploader(binfo.sess) 228 | binfo.uploader.Concurrency = cfg.NumWorkers * 4 229 | binfo.uploader.PartSize = cfg.ChunkSize 230 | } 231 | fmt.Printf("Bucket: %s, Prefix: %s, Profile: %s, Endpoint-URL: %s, Region:%s\n", binfo.bucket, binfo.prefix, binfo.profile, binfo.endpoint, binfo.region) 232 | } else 233 | 234 | // TODO: Azure Blog Storage 235 | 236 | { // Support presign url 237 | if strings.HasPrefix(binfo.url, "http") { 238 | fmt.Printf("Presign URL: %s\n", binfo.url) 239 | continue 240 | } 241 | 242 | // Verify the local path 243 | urlInfo, err := os.Stat(binfo.url) 244 | if err != nil { 245 | log.Printf("Invalid path, try to create directories: %s\n", binfo.url) // 自动创建新目录 246 | if err := os.MkdirAll(binfo.url, 0755); err != nil { 247 | log.Fatalln("Failed to create directories:", binfo.url, err) 248 | } 249 | } else { 250 | if urlInfo.IsDir() && !strings.HasSuffix(binfo.url, string(os.PathSeparator)) { 251 | binfo.url += string(os.PathSeparator) 252 | } 253 | fmt.Printf("Local: %s\n", binfo.url) 254 | } 255 | } 256 | } 257 | if cfg.WorkMode == "SQS_SEND" || cfg.WorkMode == "SQS_CONSUME" { 258 | sqsSvc = getSQSsess() 259 | } 260 | } 261 | 262 | func main() { 263 | startTime := time.Now() 264 | getConfig() 265 | fmt.Printf(" Target StorageClass(default: STANDARD): %s\n Target ACL(default: private): %s\n Source noSignRequest: %t\n 
Source requestPayer: %t\n", to.storageClass, to.ACL, from.noSignRequest, from.requestPayer) 266 | fmt.Printf(" Transfer Metadata: %t\n List Target Before Transfer(Recommended): %t\n Skip Compare Before Transfer: %t\n", cfg.TransferMetadata, cfg.ListTarget, cfg.SkipCompare) 267 | fmt.Printf(" NumWorkers: %d for concurrency files; NumWorkers*4 for parts of each file and for listing target bucket\n", cfg.NumWorkers) 268 | fmt.Printf(" HttpTimeout: %ds\n MaxRetries: %d\n ResumableThreshold: %s\n", cfg.HttpTimeout, cfg.MaxRetries, ByteCountSI(cfg.ResumableThreshold)) 269 | fmt.Printf(" ChunkSize: %s\n", ByteCountSI(cfg.ChunkSize)) 270 | fmt.Printf(" WorkMode: %s\n SQS_PROFILE: %s\n SQS_URL: %s\n", cfg.WorkMode, cfg.SQSProfile, cfg.SQSUrl) 271 | // fmt.Printf("Start to transfer data? (y/n): \n") 272 | // if !cfg.YPtr { 273 | // var answer string 274 | // fmt.Scanln(&answer) 275 | // if answer != "y" { 276 | // log.Fatalln("Exit app with n command.") 277 | // } 278 | // } 279 | switch { 280 | case strings.ToUpper(cfg.WorkMode) == "DRYRUN": 281 | err := compareBucket(from, to, nil) 282 | if err != nil { 283 | log.Println("Failed to count:", err) 284 | return 285 | } 286 | case strings.ToUpper(cfg.WorkMode) == "SQS_SEND": 287 | err := compareBucket(from, to, sqsSvc) 288 | if err != nil { 289 | log.Println("Failed to send sqs:", err) 290 | return 291 | } 292 | case strings.ToUpper(cfg.WorkMode) == "SQS_CONSUME": 293 | err := consumeSQS(sqsSvc) 294 | if err != nil { 295 | log.Println("Failed to consume sqs:", err) 296 | return 297 | } 298 | case strings.HasPrefix(from.url, "s3://") && strings.HasPrefix(to.url, "s3://"): 299 | cfg.WorkMode = "S3TOS3" 300 | err := s3tos3(from, to) 301 | if err != nil { 302 | log.Println("Failed to s3tos3:", err) 303 | return 304 | } 305 | case strings.HasPrefix(from.url, "s3://"): 306 | cfg.WorkMode = "GET" 307 | err := startDownload(from, to) 308 | if err != nil { 309 | log.Println("Failed to download:", err) 310 | return 311 | } 312 | case strings.HasPrefix(to.url, "s3://"): 313 | cfg.WorkMode = "PUT" 314 | err := startUpload(from, to) 315 | if err != nil { 316 | log.Println("Failed to upload:", err) 317 | return 318 | } 319 | case strings.HasPrefix(from.url, "http"): 320 | cfg.WorkMode = "HTTP_DOWNLOAD" 321 | err := startHttpDownload(from, to) 322 | if err != nil { 323 | log.Println("Failed to download:", err) 324 | return 325 | } 326 | case strings.ToUpper(cfg.WorkMode) == "HTTP_DOWNLOAD_LIST": 327 | err := startHttpDownload(from, to) 328 | if err != nil { 329 | log.Println("Failed to download from list:", err) 330 | return 331 | } 332 | default: 333 | log.Fatal("ERR WorkMode, invalid FROM_URL or TO_URL") 334 | } 335 | log.Printf("\n\nTotalObjects:%d, TotalSizes:%s(%d). 
The program ran for %v\n", objectCount, ByteCountSI(sizeCount), sizeCount, time.Since(startTime)) 336 | log.Println("From:", from.url) 337 | log.Println("To:", to.url) 338 | } 339 | 340 | type RetryRoundTripper struct { 341 | Proxied http.RoundTripper 342 | Retries int 343 | Delay time.Duration 344 | } 345 | 346 | func (rrt *RetryRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) { 347 | var resp *http.Response 348 | var err error 349 | 350 | for i := 0; i <= rrt.Retries; i++ { 351 | resp, err = rrt.Proxied.RoundTrip(req) 352 | if err != nil { 353 | log.Printf("HTTP API Request failed and retry: %s", err) 354 | time.Sleep(rrt.Delay) 355 | continue 356 | } 357 | break 358 | } 359 | return resp, err 360 | } 361 | func getSess(bInfo *BInfo) *session.Session { 362 | // 创建具有超时重试的 http 客户端 363 | client := &http.Client{ 364 | Timeout: time.Duration(cfg.HttpTimeout) * time.Second, 365 | Transport: &RetryRoundTripper{ 366 | Proxied: http.DefaultTransport, 367 | Retries: cfg.MaxRetries, 368 | Delay: time.Duration(cfg.RetryDelay) * time.Second, 369 | }, 370 | } 371 | config := aws.Config{ 372 | MaxRetries: aws.Int(cfg.MaxRetries), // 自定义S3 Client最大重试次数 373 | HTTPClient: client, // 使用自定义了超时时间的 http 客户端 374 | } 375 | if cfg.ForcePathStyle { 376 | config.S3ForcePathStyle = aws.Bool(true) // 以路径方式访问 而不是域名 377 | } 378 | if bInfo.endpoint != "" { 379 | completeEndpointURL(bInfo) // 自动完善endpoint url 380 | config.Endpoint = aws.String(bInfo.endpoint) 381 | } 382 | // 如果noSignRequest 则必须要有region 383 | if bInfo.noSignRequest { 384 | if bInfo.region != "" { 385 | config.Credentials = credentials.AnonymousCredentials 386 | } else { 387 | log.Fatalf("No region specified for noSignRequest bucket: %s\n", bInfo.bucket) 388 | } 389 | } else if bInfo.region == "" { 390 | // Call GetBucketLocation to determine the bucket's region. 
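The RetryRoundTripper defined above retries a request only when RoundTrip returns a transport-level error; an HTTP 5xx or 429 response is passed straight back to the caller (for S3 API calls the SDK's own MaxRetries setting covers that case, but the plain presigned-URL downloads in http_download.go rely on this wrapper alone). A standalone variation that also retries on retryable status codes could look like the sketch below; this is an illustration, not code from this repository:

```go
package main

import (
	"io"
	"log"
	"net/http"
	"time"
)

// statusRetryTransport is a hypothetical alternative to RetryRoundTripper that
// also retries 5xx/429 responses. Safe for GET requests without a request body.
type statusRetryTransport struct {
	next    http.RoundTripper
	retries int
	delay   time.Duration
}

func (t *statusRetryTransport) RoundTrip(req *http.Request) (*http.Response, error) {
	var resp *http.Response
	var err error
	for i := 0; ; i++ {
		resp, err = t.next.RoundTrip(req)
		retryable := err != nil || resp.StatusCode >= 500 || resp.StatusCode == http.StatusTooManyRequests
		if !retryable || i >= t.retries {
			return resp, err
		}
		if err == nil {
			io.Copy(io.Discard, resp.Body) // drain so the connection can be reused
			resp.Body.Close()
		}
		log.Printf("HTTP request retry %d/%d: %v", i+1, t.retries, err)
		time.Sleep(t.delay)
	}
}

func main() {
	client := &http.Client{Transport: &statusRetryTransport{next: http.DefaultTransport, retries: 3, delay: 2 * time.Second}}
	resp, err := client.Get("https://example.com/") // illustrative URL
	if err == nil {
		resp.Body.Close()
	}
}
```

If adopted, it would be wired in exactly where RetryRoundTripper is wired today, both in getSess() and in the HTTP download path.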
391 | tempS3sess, err := session.NewSessionWithOptions(session.Options{ 392 | Config: config, 393 | Profile: bInfo.profile, // ~/.aws/目录下,文件名为config或者credentials 394 | SharedConfigState: session.SharedConfigEnable, 395 | }) 396 | if err != nil { 397 | log.Fatalf("Failed to create session with reading ~/.aws/credentials profile: %s, with endpoint: %s err: %v\n", bInfo.profile, bInfo.endpoint, err) 398 | } 399 | result, err := s3.New(tempS3sess).GetBucketLocation(&s3.GetBucketLocationInput{ 400 | Bucket: aws.String(bInfo.bucket), 401 | }) 402 | if err != nil { 403 | log.Fatalf("Failed to get bucket location: %s, err: %v\n", bInfo.bucket, err) 404 | } 405 | if result.LocationConstraint == nil { 406 | bInfo.region = "us-east-1" // Default bucket's region is us-east-1 407 | } else { 408 | bInfo.region = aws.StringValue(result.LocationConstraint) 409 | } 410 | } 411 | config.Region = aws.String(bInfo.region) 412 | sess, err := session.NewSessionWithOptions(session.Options{ 413 | Config: config, 414 | Profile: bInfo.profile, 415 | SharedConfigState: session.SharedConfigEnable, 416 | }) 417 | if err != nil { 418 | log.Fatalf("Failed to create session with reading ~/.aws/credentials profile: %s, in bucket region: %s, with endpoint: %s err: %v\n", bInfo.profile, bInfo.region, bInfo.endpoint, err) 419 | } 420 | return sess 421 | } 422 | 423 | // 自动完善endpoint url 424 | func completeEndpointURL(bInfo *BInfo) { 425 | switch bInfo.endpoint { 426 | case "Aliyun_OSS": 427 | if bInfo.region == "" { 428 | log.Fatalf("No region specified for bucket: %s\n", bInfo.bucket) 429 | } 430 | bInfo.endpoint = fmt.Sprintf("https://oss-%s.aliyuncs.com", bInfo.region) 431 | case "Tencent_COS": 432 | if bInfo.region == "" { 433 | log.Fatalf("No region specified for bucket:%s\n", bInfo.bucket) 434 | } 435 | bInfo.endpoint = fmt.Sprintf("https://cos.%s.myqcloud.com", bInfo.region) 436 | case "Google_GCS": 437 | bInfo.endpoint = "https://storage.googleapis.com" 438 | } 439 | // 都不是以上定义字符串则自直接使用endpoint url的字符串 440 | 441 | } 442 | 443 | func getSQSsess() *sqs.SQS { 444 | // get region from cfg.SQSUrl "https://sqs.us-east-1.amazonaws.com/my_account/my_queue_name" 445 | u, err := url.Parse(cfg.SQSUrl) 446 | if err != nil { 447 | log.Fatalln("fail to parse SQS url", err) 448 | } 449 | hostParts := strings.Split(u.Host, ".") 450 | if len(hostParts) < 2 { 451 | log.Fatalln("Invalid SQS URL") 452 | } 453 | SQSRegion := hostParts[1] 454 | 455 | // 创建具有超时的 http 客户端 456 | client := &http.Client{Timeout: time.Duration(cfg.HttpTimeout) * time.Second} 457 | config := aws.Config{ 458 | MaxRetries: aws.Int(cfg.MaxRetries), // 自定义S3 Client最大重试次数 459 | HTTPClient: client, // 使用自定义了超时时间的 http 客户端 460 | Region: aws.String(SQSRegion), 461 | } 462 | sqssess, err := session.NewSessionWithOptions(session.Options{ 463 | Config: config, 464 | Profile: cfg.SQSProfile, 465 | SharedConfigState: session.SharedConfigEnable, 466 | }) 467 | if err != nil { 468 | log.Fatalf("Failed to create SQS session with reading ~/.aws/credentials profile: %s, err: %v\n", from.profile, err) 469 | } 470 | sqsSvc := sqs.New(sqssess) 471 | return sqsSvc 472 | } 473 | -------------------------------------------------------------------------------- /s3tos3.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "crypto/md5" 7 | "encoding/base64" 8 | "log" 9 | "mime" 10 | "path" 11 | "path/filepath" 12 | "strings" 13 | "sync" 14 | "sync/atomic" 15 | 16 | 
"github.com/aws/aws-sdk-go/aws" 17 | "github.com/aws/aws-sdk-go/service/s3" 18 | "github.com/aws/aws-sdk-go/service/s3/s3manager" 19 | "golang.org/x/sync/semaphore" 20 | ) 21 | 22 | func s3tos3(from, to BInfo) error { 23 | ignoreList := getIgnoreList() 24 | var wg sync.WaitGroup 25 | semFile := semaphore.NewWeighted(int64(cfg.NumWorkers)) // 并发量为NumWorkers的信号量 for file 26 | 27 | targetObjectList := make([]*s3.Object, 0) 28 | var err error 29 | if cfg.ListTarget && !cfg.SkipCompare { 30 | targetObjectList, err = getS3ObjectList(to) // 获取目标 S3 桶中的文件列表 31 | if err != nil { 32 | return err 33 | } 34 | } 35 | 36 | multipartUploadsList, _ := getMultipartUploadList(to.svc, to.bucket, to.prefix) 37 | 38 | // 遍历源S3 39 | inputListSource := &s3.ListObjectsV2Input{ 40 | Bucket: aws.String(from.bucket), 41 | Prefix: aws.String(from.prefix), 42 | } 43 | if from.requestPayer { 44 | inputListSource.RequestPayer = aws.String("requester") 45 | } 46 | log.Printf("Listing srouce s3://%s\n", path.Join(from.bucket, from.prefix)) 47 | err = from.svc.ListObjectsV2Pages(inputListSource, func(page *s3.ListObjectsV2Output, lastPage bool) bool { 48 | for _, item := range page.Contents { 49 | // Skip if the object is a directory 50 | if strings.HasSuffix(*item.Key, "/") { 51 | log.Println("...Skipping directory", *item.Key) 52 | continue 53 | } 54 | 55 | // Skip if key in ignoreList 56 | if isIgnored(*item.Key, ignoreList) { 57 | log.Println("...Skiping ignored key in ignoreList", *item.Key) 58 | } 59 | 60 | var combinedKey string 61 | if *item.Key != from.prefix { 62 | combinedKey = path.Join(to.prefix, strings.TrimPrefix(*item.Key, from.prefix)) 63 | combinedKey = strings.TrimPrefix(combinedKey, "/") 64 | } else { 65 | combinedKey = path.Join(to.prefix, path.Base(*item.Key)) 66 | } 67 | contentType := mime.TypeByExtension(filepath.Ext(*item.Key)) 68 | fileInfo := FileInfo{ 69 | FromBucket: from.bucket, 70 | FromKey: *item.Key, 71 | ToBucket: to.bucket, 72 | ToKey: combinedKey, 73 | Size: *item.Size, 74 | Others: MetaStruct{ContentType: &contentType}, 75 | } 76 | err = s3tos3Action(from, to, fileInfo, semFile, &wg, multipartUploadsList, targetObjectList) 77 | if err != nil { 78 | log.Println("Failed to s3tos3Action", err) 79 | return false 80 | } 81 | } 82 | return true 83 | }) 84 | if err != nil { 85 | log.Println("Failed to list objects", err) 86 | return err 87 | } 88 | wg.Wait() 89 | return err 90 | } 91 | 92 | func s3tos3Action(from, to BInfo, fileInfo FileInfo, semFile *semaphore.Weighted, wg *sync.WaitGroup, multipartUploadsList []*s3.MultipartUpload, targetObjectList []*s3.Object) error { 93 | if cfg.TransferMetadata { 94 | err := getMetadata(from, &fileInfo) 95 | if err != nil { 96 | return err 97 | } 98 | } 99 | 100 | // Check file exist on S3 Bucket and get uploadId 101 | uploadId, err := getUploadId(to.svc, fileInfo, multipartUploadsList, targetObjectList) 102 | if err != nil { 103 | return err 104 | } 105 | if uploadId == "NEXT" { 106 | log.Printf("...File exists and same size. Skipping target. 
s3://%s\n", path.Join(fileInfo.ToBucket, fileInfo.ToKey)) 107 | return nil 108 | } 109 | 110 | semFile.Acquire(context.Background(), 1) //从线程信号池中获取,没有线程可用了就阻塞等待 111 | atomic.AddInt32(&runningGoroutines, 1) //线程计数 112 | wg.Add(1) 113 | go func() { 114 | defer wg.Done() 115 | defer semFile.Release(1) //释放线程信号池 116 | defer atomic.AddInt32(&runningGoroutines, -1) //线程计数 117 | 118 | if fileInfo.Size < cfg.ResumableThreshold { 119 | err := transferSmall(from, to, fileInfo) 120 | if err != nil { 121 | log.Println("Failed to transferSmall", err) 122 | return 123 | } 124 | } else { 125 | // >= ResumableThreshold 126 | log.Printf(" Start to transfer (>= ResumableThreshold) s3://%s, runningGoroutines: %d\n", path.Join(fileInfo.ToBucket, fileInfo.ToKey), runningGoroutines) 127 | err := transferMultipart(from, to, uploadId, fileInfo) 128 | if err != nil { 129 | log.Println("Failed to multipartProccess", err) 130 | return 131 | } 132 | } 133 | log.Printf("***Successfully transfered s3://%s\n", path.Join(fileInfo.ToBucket, fileInfo.ToKey)) 134 | atomic.AddInt64(&objectCount, 1) 135 | atomic.AddInt64(&sizeCount, fileInfo.Size) 136 | }() 137 | return nil 138 | } 139 | 140 | func transferSmall(from, to BInfo, fileInfo FileInfo) error { 141 | log.Printf(" Start to download (< ResumableThreshold) from %s/%s, runningGoroutines: %d\n", fileInfo.FromBucket, fileInfo.FromKey, runningGoroutines) 142 | buff := &aws.WriteAtBuffer{} 143 | inputDownload := &s3.GetObjectInput{ 144 | Bucket: aws.String(fileInfo.FromBucket), 145 | Key: aws.String(fileInfo.FromKey), 146 | } 147 | if from.requestPayer { 148 | inputDownload.RequestPayer = aws.String("requester") 149 | } 150 | _, err := from.downloader.Download(buff, inputDownload) 151 | if err != nil { 152 | log.Println("Error download from", from.url, err) 153 | return err 154 | } 155 | 156 | md5Hash := md5.Sum(buff.Bytes()) 157 | md5Str := base64.StdEncoding.EncodeToString(md5Hash[:]) 158 | log.Printf(" Start to upload (< ResumableThreshold) to %s/%s, runningGoroutines: %d\n", fileInfo.ToBucket, fileInfo.ToKey, runningGoroutines) 159 | inputUpload := &s3manager.UploadInput{ 160 | Bucket: aws.String(fileInfo.ToBucket), 161 | Key: aws.String(fileInfo.ToKey), 162 | Body: bytes.NewReader(buff.Bytes()), 163 | ContentMD5: aws.String(md5Str), 164 | } 165 | if to.storageClass != "" { 166 | inputUpload.StorageClass = aws.String(to.storageClass) 167 | } 168 | if to.ACL != "" { 169 | inputUpload.ACL = aws.String(to.ACL) 170 | } 171 | 172 | if fileInfo.Others.ContentType != nil && *fileInfo.Others.ContentType != "" { 173 | inputUpload.ContentType = fileInfo.Others.ContentType 174 | } 175 | if cfg.TransferMetadata { 176 | inputUpload.Metadata = fileInfo.Others.Metadata 177 | inputUpload.ContentEncoding = fileInfo.Others.ContentEncoding 178 | inputUpload.ContentLanguage = fileInfo.Others.ContentLanguage 179 | inputUpload.CacheControl = fileInfo.Others.CacheControl 180 | inputUpload.ContentDisposition = fileInfo.Others.ContentDisposition 181 | } 182 | _, err = to.uploader.Upload(inputUpload) 183 | if err != nil { 184 | log.Println("Error upload to", to.url, err) 185 | return err 186 | } 187 | return nil 188 | } 189 | 190 | func transferPart(from, to BInfo, partInfo PartInfo, wg *sync.WaitGroup, sem *semaphore.Weighted, uploadId string, partnumberList *[]PartInfo, partnumberListMutex *sync.Mutex) error { 191 | defer wg.Done() 192 | defer sem.Release(1) 193 | defer atomic.AddInt32(&runningGoroutines, -1) 194 | 195 | // Download part S3 API Call 196 | buffer, err := 
downloadPartAction(from.svc, partInfo) 197 | if err != nil { 198 | return err 199 | } 200 | // Upload part S3 API Call 201 | err = uploadPartAction(buffer, partInfo, to.svc, uploadId, partnumberList, partnumberListMutex) 202 | if err != nil { 203 | return err 204 | } 205 | return nil 206 | } 207 | -------------------------------------------------------------------------------- /sqs2trans.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "log" 7 | "net/url" 8 | "os" 9 | "path" 10 | "strings" 11 | "sync" 12 | "time" 13 | 14 | "github.com/aws/aws-sdk-go/aws" 15 | "github.com/aws/aws-sdk-go/service/s3" 16 | "github.com/aws/aws-sdk-go/service/sqs" 17 | "golang.org/x/sync/semaphore" 18 | ) 19 | 20 | type S3 struct { 21 | Bucket struct { 22 | Name string 23 | } 24 | Object struct { 25 | Key string 26 | Size int64 27 | } 28 | } 29 | 30 | type Record struct { 31 | EventVersion string 32 | EventSource string 33 | AwsRegion string 34 | EventTime string 35 | EventName string 36 | S3 S3 37 | } 38 | 39 | type Message struct { 40 | Records []Record 41 | Event string 42 | } 43 | 44 | // CompareBucket and send SQS message, not checking Head file (no TransferMetadata) 45 | func compareBucket(from, to BInfo, sqsSvc *sqs.SQS) error { 46 | var wg sync.WaitGroup 47 | var ignoreList []*string 48 | var fromList, toList []*s3.Object 49 | var err error 50 | var jobList []Message 51 | 52 | wg.Add(3) 53 | go func() { 54 | defer wg.Done() 55 | ignoreList = getIgnoreList() 56 | if err != nil { 57 | log.Fatalln(err) 58 | } 59 | }() 60 | go func() { 61 | defer wg.Done() 62 | fromList, err = getS3ObjectList(from) 63 | if err != nil { 64 | log.Fatalln(err) 65 | } 66 | }() 67 | go func() { 68 | defer wg.Done() 69 | toList, err = getS3ObjectList(to) 70 | if err != nil { 71 | log.Fatalln(err) 72 | } 73 | }() 74 | wg.Wait() 75 | // Compare each objects's name and size, pick up the delta 76 | jobList, sizeCount = compareS3Objects(fromList, toList, ignoreList, from, to) 77 | objectCount = int64(len(jobList)) 78 | 79 | // sqsSvc 如果nil就是不发SQS,只统计和写log 80 | if sqsSvc != nil { 81 | log.Printf("Found %d jobs to send SQS\n", objectCount) 82 | wg.Add(1) 83 | go func() { 84 | // Send SQS Message in batch 85 | defer wg.Done() 86 | log.Printf("Uploading jobs to SQS queue: %s\n", cfg.SQSUrl) 87 | err = sendSQS(jobList, sqsSvc) 88 | if err != nil { 89 | log.Println("Failed to send SQS messages", err) 90 | } 91 | }() 92 | } 93 | 94 | // Write jobList to file 95 | if cfg.JobListPath != "" { 96 | wg.Add(1) 97 | go func() { 98 | // Write Messages to file 99 | defer wg.Done() 100 | log.Println("Writing SQS messages to file", cfg.JobListPath) 101 | err = writeJobListFile(jobList, cfg.JobListPath) 102 | if err != nil { 103 | log.Println("Failed to write SQS messages to file", err) 104 | } 105 | }() 106 | } 107 | wg.Wait() 108 | return nil 109 | } 110 | 111 | // Compare S3 Objects, return delta list 112 | func compareS3Objects(fromList, toList []*s3.Object, ignoreList []*string, from, to BInfo) ([]Message, int64) { 113 | var listSizeCount = int64(0) 114 | delta := make([]Message, 0) 115 | fromMap := make(map[string]*s3.Object) 116 | toMap := make(map[string]*s3.Object) 117 | 118 | for _, obj := range fromList { 119 | if !isIgnored(*obj.Key, ignoreList) { 120 | fromMap[*obj.Key] = obj 121 | } 122 | } 123 | for _, obj := range toList { 124 | toMap[*obj.Key] = obj 125 | } 126 | 127 | // 只生成在From有而To没有的或Size不同的 128 | for key, fromObj := range fromMap 
{ 129 | // 根据源和目标的 prefix 创建目标 key 130 | toKey := path.Join(to.prefix, strings.TrimPrefix(key, from.prefix)) 131 | 132 | toObj, ok := toMap[toKey] 133 | if !ok || *toObj.Size != *fromObj.Size { 134 | records := []Record{ 135 | { 136 | EventVersion: "2.1", 137 | EventSource: "aws:s3", 138 | AwsRegion: from.region, // You may need to update this 139 | EventTime: time.Now().Format(time.RFC3339), // Use current time 140 | EventName: "ObjectCreated:Put", // Assume object creation 141 | S3: S3{ 142 | Bucket: struct { 143 | Name string 144 | }{ 145 | Name: from.bucket, 146 | }, 147 | Object: struct { 148 | Key string 149 | Size int64 150 | }{ 151 | Key: key, 152 | Size: *fromObj.Size, 153 | }, 154 | }, 155 | }, 156 | } 157 | msg := Message{Records: records} 158 | delta = append(delta, msg) 159 | listSizeCount += *fromObj.Size 160 | } 161 | } 162 | return delta, listSizeCount 163 | } 164 | 165 | // Send SQS Message in batch with concurrency goroutines 166 | func sendSQS(jobList []Message, sqsSvc *sqs.SQS) error { 167 | var sqsBatch int 168 | var sqsMessage []*sqs.SendMessageBatchRequestEntry 169 | var wg sync.WaitGroup 170 | BatchSize := 10 171 | 172 | // Create a buffered channel to hold the jobs 要并发写SQS,所以用channel做buffer 173 | jobs := make(chan []*sqs.SendMessageBatchRequestEntry, cfg.NumWorkers) 174 | 175 | // Start the workers concurrency cfg.NumWorkers 176 | for i := 0; i < cfg.NumWorkers; i++ { 177 | wg.Add(1) 178 | go sendSQSWorker(i, jobs, &wg, sqsSvc) 179 | } 180 | 181 | // Send SQS Message in batch 182 | for i, job := range jobList { 183 | jobJSON, err := json.Marshal(job) 184 | if err != nil { 185 | return fmt.Errorf("failed to marshal job: %v", err) 186 | } 187 | sqsMessage = append(sqsMessage, &sqs.SendMessageBatchRequestEntry{ 188 | Id: aws.String(fmt.Sprint(i)), 189 | MessageBody: aws.String(string(jobJSON)), 190 | }) 191 | sqsBatch++ 192 | 193 | if sqsBatch == BatchSize || i == len(jobList)-1 { 194 | // Copy sqsMessage to prevent data race 195 | sqsMessageCopy := make([]*sqs.SendMessageBatchRequestEntry, len(sqsMessage)) 196 | copy(sqsMessageCopy, sqsMessage) 197 | 198 | // Send a job to the workers 199 | jobs <- sqsMessageCopy 200 | 201 | sqsBatch = 0 202 | sqsMessage = sqsMessage[:0] 203 | } 204 | } 205 | 206 | close(jobs) // close the jobs channel 207 | wg.Wait() 208 | log.Printf("Complete upload job to queue: %s\n", cfg.SQSUrl) 209 | return nil 210 | } 211 | 212 | func sendSQSWorker(id int, jobs <-chan []*sqs.SendMessageBatchRequestEntry, wg *sync.WaitGroup, sqsSvc *sqs.SQS) { 213 | defer wg.Done() 214 | var file *os.File 215 | var err error 216 | var logPath string 217 | 218 | // Prepare SQS sent log for writing a file, it is for backup 219 | if cfg.SQSSentLogName != "" { 220 | // Create SQS sent log file 221 | dateTimePrefix := time.Now().Format("20060102150405") 222 | logPath = fmt.Sprintf("%s-%s-sqs-sent-%d.log", cfg.SQSSentLogName, dateTimePrefix, id) 223 | file, err = os.Create(logPath) 224 | if err != nil { 225 | log.Printf("Failed to create SQS sent log file: %v\n", err) 226 | return 227 | } 228 | defer file.Close() 229 | } 230 | 231 | for job := range jobs { 232 | 233 | // Send Message to SQS 234 | _, err := sqsSvc.SendMessageBatch(&sqs.SendMessageBatchInput{ 235 | QueueUrl: aws.String(cfg.SQSUrl), 236 | Entries: job, 237 | }) 238 | if err != nil { 239 | log.Printf("Worker %d: Failed to send sqs message: %v; JobList: %v\n", id, err, job) 240 | continue 241 | } 242 | 243 | // Write SQS sent log to file for backup 244 | if cfg.SQSSentLogName != "" { 245 | for _, entry := 
range job { 246 | var messageBody map[string]interface{} 247 | err := json.Unmarshal([]byte(*entry.MessageBody), &messageBody) 248 | if err != nil { 249 | log.Printf("Worker %d: Failed to unmarshal MessageBody: %v\n", id, err) 250 | continue 251 | } 252 | messageBodyJson, err := json.Marshal(messageBody) 253 | if err != nil { 254 | log.Printf("Worker %d: Failed to marshal MessageBody to JSON: %v\n", id, err) 255 | continue 256 | } 257 | _, err = file.WriteString(string(messageBodyJson) + "\n") 258 | if err != nil { 259 | log.Printf("Worker %d: Failed to write SQS sent log: %v\n", id, err) 260 | continue 261 | } 262 | } 263 | } 264 | } 265 | log.Printf("Worker %d: Complete upload job to queue\n", id) 266 | if cfg.SQSSentLogName != "" { 267 | log.Printf("Worker %d: Complete write SQS sent log to file: %s\n", id, logPath) 268 | } 269 | } 270 | 271 | func writeJobListFile(jobList []Message, path string) error { 272 | 273 | // Check if the file exists 274 | if _, err := os.Stat(path); os.IsNotExist(err) { 275 | // If not, create the file 276 | file, err := os.Create(path) 277 | if err != nil { 278 | fmt.Println("Error creating file: ", err) 279 | return err 280 | } 281 | file.Close() 282 | } 283 | // Open the file in append mode 284 | file, err := os.OpenFile(path, os.O_APPEND|os.O_WRONLY, 0644) 285 | if err != nil { 286 | fmt.Println("Error opening file: ", err) 287 | return err 288 | } 289 | defer file.Close() 290 | 291 | for _, job := range jobList { 292 | jobJSON, err := json.Marshal(job) 293 | if err != nil { 294 | return fmt.Errorf("failed to marshal job: %v", err) 295 | } 296 | _, err = file.WriteString(string(jobJSON) + "\n") 297 | if err != nil { 298 | return err 299 | } 300 | } 301 | log.Println("Complete writing job list to file", path) 302 | 303 | return nil 304 | } 305 | 306 | func consumeSQS(sqsSvc *sqs.SQS) error { 307 | var wgsqs sync.WaitGroup // 用于等待所有worker完成(适配s3tos3Action) 308 | semFile := semaphore.NewWeighted(int64(cfg.NumWorkers)) // 并发量为NumWorkers的信号量(适配s3tos3Action) 309 | ignoreList := getIgnoreList() 310 | for i := 0; i < cfg.NumWorkers; i++ { 311 | wgsqs.Add(1) 312 | go getSQSWorker(i, semFile, &wgsqs, sqsSvc, ignoreList) 313 | } 314 | wgsqs.Wait() 315 | return nil 316 | } 317 | 318 | func getSQSWorker(i int, semFile *semaphore.Weighted, wgsqs *sync.WaitGroup, sqsSvc *sqs.SQS, ignoreList []*string) { 319 | defer wgsqs.Done() 320 | sqsBatch := aws.Int64(10) 321 | var wg sync.WaitGroup 322 | 323 | for { 324 | resp, err := sqsSvc.ReceiveMessage(&sqs.ReceiveMessageInput{ 325 | QueueUrl: &cfg.SQSUrl, 326 | MaxNumberOfMessages: sqsBatch, 327 | }) 328 | if err != nil { 329 | log.Printf("Worker %d: Failed to get SQS. Wait for 5 seconds. ERR: %v\n", i, err) 330 | time.Sleep(time.Duration(cfg.RetryDelay) * time.Second) 331 | continue 332 | } 333 | 334 | if len(resp.Messages) == 0 { 335 | log.Printf("Worker %d: No message in queue available, wait...", i) 336 | time.Sleep(60 * time.Second) 337 | continue 338 | } 339 | log.Printf("Worker %d: Received %d messages\n", i, len(resp.Messages)) 340 | // 对Batch Message中的每个Record进行处理 341 | for _, message := range resp.Messages { 342 | var msg Message 343 | var transferErr error 344 | transferErr = json.Unmarshal([]byte(*message.Body), &msg) 345 | if transferErr != nil { 346 | log.Printf("Worker %d: Failed to parse SQS message. 
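The worker above unmarshals each SQS body into the Message type declared at the top of sqs2trans.go, which mirrors the S3 event notification format that compareS3Objects() emits. A small illustration of that wire format follows; it assumes the Message, Record and S3 types (and imports) defined earlier in this file, and the field values are invented:

```go
// Illustrative only: build one job message and show its JSON encoding.
func exampleJobMessage() (string, error) {
	msg := Message{Records: []Record{{
		EventVersion: "2.1",
		EventSource:  "aws:s3",
		AwsRegion:    "us-east-1",
		EventTime:    time.Now().Format(time.RFC3339),
		EventName:    "ObjectCreated:Put",
		S3: S3{
			Bucket: struct{ Name string }{Name: "my-source-bucket"},
			Object: struct {
				Key  string
				Size int64
			}{Key: "prefix/data.bin", Size: 1048576},
		},
	}}}
	b, err := json.Marshal(msg)
	// b is roughly:
	// {"Records":[{"EventVersion":"2.1","EventSource":"aws:s3","AwsRegion":"us-east-1",
	//   "EventTime":"2024-01-01T00:00:00Z","EventName":"ObjectCreated:Put",
	//   "S3":{"Bucket":{"Name":"my-source-bucket"},"Object":{"Key":"prefix/data.bin","Size":1048576}}}],
	//  "Event":""}
	return string(b), err
}
```

getSQSWorker below then URL-unescapes record.S3.Object.Key and joins it with to.prefix to form the target key.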
ERR: %v\n", i, err) 347 | continue 348 | } 349 | 350 | for _, record := range msg.Records { 351 | // 根据源和目标的 prefix 创建目标 key 352 | // Decode the key from URL format 避免中间出现 + 号的情况 353 | var decodedKey string 354 | decodedKey, transferErr = url.QueryUnescape(record.S3.Object.Key) 355 | if transferErr != nil { 356 | log.Printf("Failed to decode key: %v. ERR: %v\n", record.S3.Object.Key, err) 357 | continue // TODO: 这里跳出去之后会跑到SQS Del去了 358 | } 359 | 360 | // ignore list 361 | if isIgnored(decodedKey, ignoreList) { 362 | log.Printf("Skipping ignored key in ignoreList %s\n", decodedKey) 363 | continue 364 | } 365 | 366 | toKey := path.Join(to.prefix, strings.TrimPrefix(decodedKey, from.prefix)) 367 | fileInfo := FileInfo{ 368 | FromKey: decodedKey, 369 | FromBucket: record.S3.Bucket.Name, 370 | Size: record.S3.Object.Size, 371 | ToBucket: to.bucket, 372 | ToKey: toKey, 373 | } 374 | 375 | // Transfer object 376 | if strings.HasPrefix(record.EventName, "ObjectCreated:") { 377 | targetObjectList := make([]*s3.Object, 0) // 按照SQS消息来传输,将忽略targetObjectList 378 | multipartUploadsList := make([]*s3.MultipartUpload, 0) 379 | if fileInfo.Size >= cfg.ResumableThreshold { 380 | multipartUploadsList, _ = getMultipartUploadList(to.svc, fileInfo.ToBucket, fileInfo.ToKey) // 查当前key是否有未完成的Multipart Upload 381 | } 382 | transferErr = s3tos3Action(from, to, fileInfo, semFile, &wg, multipartUploadsList, targetObjectList) 383 | wg.Wait() 384 | if transferErr != nil { 385 | log.Printf("Worker %d: Failed to transfer object: %v\n", i, err) 386 | continue // TODO: 这里跳出去之后会跑到SQS Del去了 387 | } 388 | } 389 | // Delete object 390 | if strings.HasPrefix(record.EventName, "ObjectRemoved:") { 391 | transferErr = delObjcet(to.svc, fileInfo.ToBucket, fileInfo.ToKey) 392 | } 393 | } 394 | 395 | // Skip processing for "s3:TestEvent" 396 | if msg.Event == "s3:TestEvent" { 397 | fmt.Println("Skipping Test Event") 398 | } 399 | // Delete SQS message 400 | if transferErr == nil { 401 | err = delSQS(message, sqsSvc) 402 | if err != nil { 403 | log.Printf("Worker %d: Failed to delete SQS message: %v\n", i, err) 404 | continue 405 | } 406 | } 407 | } 408 | } 409 | } 410 | 411 | func delSQS(message *sqs.Message, sqsSvc *sqs.SQS) error { 412 | _, err := sqsSvc.DeleteMessage(&sqs.DeleteMessageInput{ 413 | QueueUrl: &cfg.SQSUrl, 414 | ReceiptHandle: message.ReceiptHandle, 415 | }) 416 | if err != nil { 417 | return err 418 | } 419 | return nil 420 | } 421 | 422 | func delObjcet(svc *s3.S3, bucket, key string) error { 423 | _, err := svc.DeleteObject(&s3.DeleteObjectInput{ 424 | Bucket: aws.String(bucket), 425 | Key: aws.String(key), 426 | }) 427 | if err != nil { 428 | return err 429 | } 430 | return nil 431 | } 432 | -------------------------------------------------------------------------------- /upload.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "crypto/md5" 7 | "encoding/base64" 8 | "errors" 9 | "io" 10 | "log" 11 | "mime" 12 | "os" 13 | "path" 14 | "path/filepath" 15 | "strings" 16 | "sync" 17 | "sync/atomic" 18 | 19 | "github.com/aws/aws-sdk-go/aws" 20 | "github.com/aws/aws-sdk-go/service/s3" 21 | "github.com/aws/aws-sdk-go/service/s3/s3manager" 22 | "golang.org/x/sync/semaphore" 23 | ) 24 | 25 | func startUpload(from, to BInfo) error { 26 | ignoreList := getIgnoreList() 27 | var wg sync.WaitGroup 28 | semFile := semaphore.NewWeighted(int64(cfg.NumWorkers)) // 并发量为numWorkers的信号量 for file 29 | 30 | // List Target S3 31 | 
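startUpload() here walks the local FROM_URL directory and maps every file to a target key with path.Join(to.prefix, filepath.ToSlash(strings.TrimPrefix(thispath, filepath.Dir(from.url)))), as shown in the Walk callback below. A standalone illustration of that mapping with made-up Unix paths (it also shows why getConfig() appends a trailing path separator to directory sources, so that filepath.Dir() yields the source root itself):

```go
// Made-up paths, illustrating the key mapping used by the Walk callback below.
package main

import (
	"fmt"
	"path"
	"path/filepath"
	"strings"
)

func main() {
	fromURL := "/data/photos/"            // local FROM_URL, trailing separator added by getConfig()
	toPrefix := "backup/photos"           // prefix parsed from TO_URL, e.g. s3://bucket/backup/photos
	thispath := "/data/photos/2024/a.jpg" // one file found by filepath.Walk

	key := path.Join(toPrefix, filepath.ToSlash(strings.TrimPrefix(thispath, filepath.Dir(fromURL))))
	fmt.Println(key) // backup/photos/2024/a.jpg
}
```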
targetObjectList := make([]*s3.Object, 0) 32 | var err error 33 | if cfg.ListTarget && !cfg.SkipCompare { 34 | targetObjectList, err = getS3ObjectList(to) // 获取目标 S3 桶中的文件列表 35 | if err != nil { 36 | return err 37 | } 38 | } 39 | 40 | // Listing multipart uploads ID 41 | multipartUploadsList, err := getMultipartUploadList(to.svc, to.bucket, to.prefix) 42 | if err != nil { 43 | return err 44 | } 45 | 46 | // Walk through local path for uploading 47 | err = filepath.Walk(from.url, func(thispath string, info os.FileInfo, err error) error { 48 | if err != nil { 49 | log.Println("Failed to access path", thispath, err) 50 | return err 51 | } 52 | // Skip if key in ignoreList 53 | if isIgnored(thispath, ignoreList) { 54 | log.Println("...Skipping ignored key in ignoreList", thispath); return nil 55 | } 56 | // Skip if the path is a directory 57 | if info.IsDir() { 58 | return nil 59 | } 60 | combinedKey := path.Join(to.prefix, filepath.ToSlash(strings.TrimPrefix(thispath, filepath.Dir(from.url)))) 61 | contentType := mime.TypeByExtension(filepath.Ext(thispath)) 62 | fileInfo := FileInfo{ 63 | ToKey: combinedKey, 64 | ToBucket: to.bucket, 65 | Size: info.Size(), 66 | Others: MetaStruct{ContentType: &contentType}, 67 | } 68 | 69 | // Check if file exists on S3 Bucket and get uploadId 70 | uploadId, err := getUploadId(to.svc, fileInfo, multipartUploadsList, targetObjectList) 71 | if err != nil { 72 | return err 73 | } 74 | if uploadId == "NEXT" { 75 | log.Printf("...File already exists. Skipping... s3://%s\n", path.Join(fileInfo.ToBucket, fileInfo.ToKey)) 76 | return nil 77 | } 78 | 79 | semFile.Acquire(context.Background(), 1) //从线程信号池中获取,没有线程可用了就阻塞等待 80 | atomic.AddInt32(&runningGoroutines, 1) //线程计数 81 | wg.Add(1) 82 | go func(thispath string, info os.FileInfo, uploadId string) { 83 | defer wg.Done() 84 | defer semFile.Release(1) 85 | defer atomic.AddInt32(&runningGoroutines, -1) 86 | 87 | fileInfo.File, err = os.Open(thispath) 88 | if err != nil { 89 | log.Println("Failed to open file", thispath, err) 90 | return 91 | } 92 | defer fileInfo.File.Close() 93 | 94 | if info.Size() < cfg.ResumableThreshold { 95 | log.Printf(" Start to upload (< ResumableThreshold): %s to s3://%s, runningGoroutines: %d\n", thispath, path.Join(fileInfo.ToBucket, fileInfo.ToKey), runningGoroutines) 96 | err := uploadSmall(fileInfo, thispath, info, uploadId) 97 | if err != nil { 98 | log.Printf("Failed to uploadSmall: s3://%s, %v\n", path.Join(fileInfo.ToBucket, fileInfo.ToKey), err) 99 | return 100 | } 101 | // Success upload 102 | } else { 103 | // info.Size() >= ResumableThreshold Use multipart upload for large files resumable upload 104 | log.Printf(" Start to upload (>= ResumableThreshold): %s to s3://%s, runningGoroutines: %d\n", thispath, path.Join(fileInfo.ToBucket, fileInfo.ToKey), runningGoroutines) 105 | err := transferMultipart(from, to, uploadId, fileInfo) 106 | if err != nil { 107 | log.Printf("Failed to transferMultipart: s3://%s, %v\n", path.Join(fileInfo.ToBucket, fileInfo.ToKey), err) 108 | return 109 | } 110 | } 111 | log.Printf("***Successfully uploaded: s3://%s\n", path.Join(fileInfo.ToBucket, fileInfo.ToKey)) 112 | atomic.AddInt64(&objectCount, 1) 113 | atomic.AddInt64(&sizeCount, info.Size()) 114 | }(thispath, info, uploadId) 115 | return nil 116 | }) 117 | wg.Wait() 118 | if err != nil { 119 | log.Println("Failed to walk directory", err) 120 | return err 121 | } 122 | return err 123 | } 124 | 125 | func uploadSmall(fileInfo FileInfo, thispath string, info os.FileInfo, uploadId string) error { 126 | buff, err := 
125 | func uploadSmall(fileInfo FileInfo, thispath string, info os.FileInfo, uploadId string) error {
126 | buff, err := io.ReadAll(fileInfo.File)
127 | if err != nil {
128 | log.Println("Failed to read file", thispath, err)
129 | return err
130 | }
131 | 
132 | md5Hash := md5.Sum(buff)
133 | md5Str := base64.StdEncoding.EncodeToString(md5Hash[:])
134 | 
135 | inputUpload := &s3manager.UploadInput{
136 | Bucket: aws.String(fileInfo.ToBucket),
137 | Key: aws.String(fileInfo.ToKey),
138 | Body: bytes.NewReader(buff),
139 | ContentMD5: aws.String(md5Str),
140 | }
141 | if to.storageClass != "" {
142 | inputUpload.StorageClass = aws.String(to.storageClass)
143 | }
144 | if to.ACL != "" {
145 | inputUpload.ACL = aws.String(to.ACL)
146 | }
147 | if fileInfo.Others.ContentType != nil && *fileInfo.Others.ContentType != "" {
148 | inputUpload.ContentType = fileInfo.Others.ContentType
149 | }
150 | _, err = to.uploader.Upload(inputUpload)
151 | if err != nil {
152 | log.Printf("Failed to upload file s3://%s, err: %v\n", path.Join(fileInfo.ToBucket, fileInfo.ToKey), err)
153 | return err
154 | }
155 | 
156 | return nil
157 | }
158 | 
159 | func transferMultipart(from, to BInfo, uploadId string, fileInfo FileInfo) error {
160 | semPart := semaphore.NewWeighted(int64(cfg.NumWorkers * 4)) // semaphore for parts, concurrency NumWorkers*4
161 | var partnumberList []PartInfo
162 | var partnumberListMutex sync.Mutex
163 | var fileMutex sync.Mutex
164 | var err error
165 | 
166 | if uploadId == "" {
167 | inputCreate := &s3.CreateMultipartUploadInput{
168 | Bucket: aws.String(fileInfo.ToBucket),
169 | Key: aws.String(fileInfo.ToKey),
170 | }
171 | if to.storageClass != "" {
172 | inputCreate.StorageClass = aws.String(to.storageClass)
173 | }
174 | if to.ACL != "" {
175 | inputCreate.ACL = aws.String(to.ACL)
176 | }
177 | if fileInfo.Others.ContentType != nil && *fileInfo.Others.ContentType != "" {
178 | inputCreate.ContentType = fileInfo.Others.ContentType
179 | }
180 | if cfg.TransferMetadata {
181 | inputCreate.Metadata = fileInfo.Others.Metadata
182 | inputCreate.ContentEncoding = fileInfo.Others.ContentEncoding
183 | inputCreate.CacheControl = fileInfo.Others.CacheControl
184 | inputCreate.ContentLanguage = fileInfo.Others.ContentLanguage
185 | inputCreate.ContentDisposition = fileInfo.Others.ContentDisposition
186 | }
187 | resp, err := to.svc.CreateMultipartUpload(inputCreate)
188 | if err != nil {
189 | log.Println("Failed to create multipart upload", fileInfo.ToBucket, fileInfo.ToKey, err)
190 | return err
191 | }
192 | uploadId = *resp.UploadId
193 | } else {
194 | partnumberList, err = checkPartnumberList(to.svc, fileInfo.ToBucket, fileInfo.ToKey, uploadId)
195 | if err != nil {
196 | log.Println("Failed to get part number list", fileInfo.ToBucket, fileInfo.ToKey, err)
197 | _ = partnumberList
198 | return err
199 | }
200 | }
201 | 
202 | indexList, chunkSizeAuto := split(fileInfo, cfg.ChunkSize)
203 | 
204 | var wg2 sync.WaitGroup
205 | for i, offset := range indexList {
206 | // Check whether part i+1 is already in partnumberList
207 | found := false
208 | for _, value := range partnumberList {
209 | if int64(i+1) == value.PartNumber {
210 | found = true
211 | break
212 | }
213 | }
214 | // Already uploaded in a previous run, skip to the next part
215 | if found {
216 | continue
217 | }
218 | 
219 | // Not in partnumberList yet, upload this part
220 | size := chunkSizeAuto
221 | if offset+chunkSizeAuto > fileInfo.Size { // the last part gets the actual remaining size
222 | size = fileInfo.Size - offset
223 | }
224 | partInfo := PartInfo{
225 | ToKey: fileInfo.ToKey,
226 | ToBucket: fileInfo.ToBucket,
227 | FromKey: fileInfo.FromKey,
228 | FromBucket: fileInfo.FromBucket,
229 | PartNumber: int64(i + 1),
230 | Size: size,
231 | Offset: offset,
232 | TotalParts: int64(len(indexList)),
233 | }
234 | 
235 | semPart.Acquire(context.Background(), 1) // acquire a slot from the part semaphore; blocks until one is available
236 | atomic.AddInt32(&runningGoroutines, 1) // goroutine counter
237 | wg2.Add(1)
238 | if fileInfo.File != nil {
239 | go uploadPart(to.svc, partInfo, &wg2, semPart, uploadId, &partnumberList, &partnumberListMutex, fileInfo.File, &fileMutex)
240 | } else {
241 | // s3tos3 part
242 | go transferPart(from, to, partInfo, &wg2, semPart, uploadId, &partnumberList, &partnumberListMutex)
243 | }
244 | 
245 | }
246 | wg2.Wait()
247 | if len(indexList) == len(partnumberList) {
248 | err := completeUpload(to.svc, uploadId, fileInfo.ToBucket, fileInfo.ToKey, &partnumberList)
249 | if err != nil {
250 | log.Println("Failed to complete upload", err, fileInfo.ToBucket, fileInfo.ToKey)
251 | return err
252 | }
253 | // Success complete upload
254 | } else {
255 | log.Println("Failed to complete upload, len(indexList) != len(partnumberList)", fileInfo.ToBucket, fileInfo.ToKey, len(indexList), len(partnumberList))
256 | return errors.New("failed to complete upload, len(indexList) != len(partnumberList)")
257 | }
258 | return nil
259 | }
260 | 
261 | func uploadPart(svc *s3.S3, partInfo PartInfo, wg *sync.WaitGroup, sem *semaphore.Weighted, uploadId string, partnumberList *[]PartInfo, partnumberListMutex *sync.Mutex, file *os.File, fileMutex *sync.Mutex) error {
262 | defer wg.Done()
263 | defer sem.Release(1)
264 | defer atomic.AddInt32(&runningGoroutines, -1)
265 | // log.Printf("-->Uploading s3://%s, part:%d/%d, runningGoroutines: %d\n", path.Join(partInfo.ToBucket, partInfo.ToKey), partInfo.PartNumber, partInfo.TotalParts, runningGoroutines)
266 | 
267 | // Read one part-sized chunk from the file
268 | fileMutex.Lock()
269 | file.Seek(partInfo.Offset, 0) // seek to this part's offset in the file
270 | buffer := make([]byte, partInfo.Size)
271 | _, err := io.ReadFull(file, buffer)
272 | fileMutex.Unlock() // release the file lock before any early return, so other parts are not blocked
273 | if err != nil {
274 | log.Println("Failed to read full buffer from file", partInfo.ToBucket, partInfo.ToKey, partInfo.PartNumber, err)
275 | return err
276 | }
277 | 
278 | // Upload part S3 API Call
279 | err = uploadPartAction(buffer, partInfo, svc, uploadId, partnumberList, partnumberListMutex)
280 | if err != nil {
281 | return err
282 | }
283 | return nil
284 | }
285 | 
286 | func uploadPartAction(buff []byte, partInfo PartInfo, svc *s3.S3, uploadId string, partnumberList *[]PartInfo, partnumberListMutex *sync.Mutex) error {
287 | log.Printf("-->Uploading part s3://%s, part:%d/%d, runningGoroutines: %d\n", path.Join(partInfo.ToBucket, partInfo.ToKey), partInfo.PartNumber, partInfo.TotalParts, runningGoroutines)
288 | // Compute the MD5 hash of the part data for the Content-MD5 integrity check
289 | md5Hash := md5.Sum(buff)
290 | md5Str := base64.StdEncoding.EncodeToString(md5Hash[:])
291 | 
292 | // Upload the part
293 | result, err := svc.UploadPart(&s3.UploadPartInput{
294 | Bucket: aws.String(partInfo.ToBucket),
295 | Key: aws.String(partInfo.ToKey),
296 | PartNumber: aws.Int64(int64(partInfo.PartNumber)),
297 | UploadId: aws.String(uploadId),
298 | Body: bytes.NewReader(buff),
299 | ContentLength: aws.Int64(int64(partInfo.Size)),
300 | ContentMD5: aws.String(md5Str),
301 | })
302 | if err != nil {
303 | log.Printf("Failed to upload part s3://%s, part:%d/%d, err: %v\n", path.Join(partInfo.ToBucket, partInfo.ToKey), partInfo.PartNumber, partInfo.TotalParts, err)
304 | return err // propagate the error; the part is not appended to partnumberList, so transferMultipart detects the mismatch
305 | }
306 | partnumberListMutex.Lock()
307 | *partnumberList = append(*partnumberList, PartInfo{
308 | PartNumber: partInfo.PartNumber,
309 | Etag: *result.ETag,
310 | })
311 | partnumberListMutex.Unlock()
log.Printf("===Uploaded part s3://%s, part:%d/%d\n", path.Join(partInfo.ToBucket, partInfo.ToKey), partInfo.PartNumber, partInfo.TotalParts) 313 | return nil 314 | } 315 | 316 | func completeUpload(svc *s3.S3, uploadId, bucket, key string, partnumberList *[]PartInfo) error { 317 | completedParts := []*s3.CompletedPart{} 318 | var i int64 319 | for i = 1; i <= int64(len(*partnumberList)); i++ { 320 | for _, partNumber := range *partnumberList { 321 | if i == partNumber.PartNumber { 322 | completedParts = append(completedParts, &s3.CompletedPart{ 323 | ETag: &partNumber.Etag, 324 | PartNumber: &partNumber.PartNumber, 325 | }) 326 | break 327 | } 328 | } 329 | } 330 | 331 | _, err := svc.CompleteMultipartUpload(&s3.CompleteMultipartUploadInput{ 332 | Bucket: aws.String(bucket), 333 | Key: aws.String(key), 334 | UploadId: aws.String(uploadId), 335 | MultipartUpload: &s3.CompletedMultipartUpload{ 336 | Parts: completedParts, 337 | }, 338 | }) 339 | return err 340 | } 341 | -------------------------------------------------------------------------------- /util.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "database/sql" 6 | "fmt" 7 | "log" 8 | "math" 9 | "os" 10 | "strings" 11 | "time" 12 | 13 | "github.com/google/uuid" 14 | ) 15 | 16 | func split(fileInfo FileInfo, chunkSize int64) (indexList []int64, actualChunkSize int64) { 17 | partNumber := int64(1) 18 | indexList = []int64{0} 19 | 20 | if int64(math.Ceil(float64(fileInfo.Size)/float64(chunkSize))) > 10000 { 21 | chunkSize = fileInfo.Size/10000 + 1024 // 对于大于10000分片的大文件,自动调整Chunksize 22 | } 23 | 24 | for chunkSize*partNumber < fileInfo.Size { // 如果刚好是"=",则无需再分下一part,所以这里不能用"<=" 25 | indexList = append(indexList, chunkSize*partNumber) 26 | partNumber += 1 27 | } 28 | return indexList, chunkSize 29 | } 30 | 31 | func withRetries(fn RetryFunc) error { 32 | var err error 33 | for i := 0; i < cfg.MaxRetries; i++ { 34 | err = fn() 35 | if err == nil { 36 | break 37 | } 38 | log.Println("Failed to execute function: ", err, ". 
Retrying...") 39 | time.Sleep(time.Duration(int64(math.Pow(2, float64(i)))) * time.Second) 40 | } 41 | return err 42 | } 43 | 44 | func contains(s []int, e int) bool { 45 | for _, a := range s { 46 | if a == e { 47 | return true 48 | } 49 | } 50 | return false 51 | } 52 | 53 | func ByteCountSI(b int64) string { 54 | const unit = 1024 55 | if b < unit { 56 | return fmt.Sprintf("%dBytes", b) 57 | } 58 | div, exp := int64(unit), 0 59 | for n := b / unit; n >= unit; n /= unit { 60 | div *= unit 61 | exp++ 62 | } 63 | return fmt.Sprintf("%.1f%cBytes", float64(b)/float64(div), "KMGTPE"[exp]) 64 | } 65 | 66 | func getIgnoreList() []*string { 67 | log.Printf("Checking ignore files list in %s\n", cfg.IgnoreListPath) 68 | ignoreList := []*string{} 69 | 70 | _, err := os.Stat(cfg.IgnoreListPath) 71 | if err != nil { 72 | if os.IsNotExist(err) { 73 | log.Printf("No ignore list in path %s\n", cfg.IgnoreListPath) 74 | } else { 75 | log.Println(err) 76 | } 77 | } else { 78 | file, err := os.Open(cfg.IgnoreListPath) 79 | if err != nil { 80 | log.Println(err) 81 | } 82 | defer file.Close() 83 | 84 | scanner := bufio.NewScanner(file) 85 | for scanner.Scan() { 86 | prefix := strings.TrimPrefix(scanner.Text(), "/") 87 | ignoreList = append(ignoreList, &prefix) 88 | } 89 | if err := scanner.Err(); err != nil { 90 | log.Println(err) 91 | } 92 | log.Printf("Found ignore files list with prefix Length: %d, in %s", len(ignoreList), cfg.IgnoreListPath) 93 | } 94 | return ignoreList 95 | } 96 | 97 | func isIgnored(key string, ignoreList []*string) bool { 98 | for _, prefix := range ignoreList { 99 | if strings.HasPrefix(key, *prefix) { 100 | return true 101 | } 102 | } 103 | return false 104 | } 105 | 106 | func getDatabase() (*sql.DB, error) { 107 | var database *sql.DB 108 | var err error 109 | err = withRetries(func() error { 110 | database, err = sql.Open("sqlite3", cfg.DBPath) 111 | if err != nil { 112 | fmt.Println("Failed to connect to sqlite3", err) 113 | return err 114 | } 115 | statement, err := database.Prepare("CREATE TABLE IF NOT EXISTS download (ID TEXT PRIMARY KEY, key TEXT, bucket TEXT, part INT)") 116 | if err != nil { 117 | fmt.Println("Failed to prepare getDatabase statement: ", err) 118 | return err 119 | } 120 | _, err = statement.Exec() 121 | if err != nil { 122 | return err 123 | } 124 | return nil 125 | }) 126 | if err != nil { 127 | return nil, err 128 | } 129 | return database, nil 130 | } 131 | 132 | func recordDownloadPart(partInfo PartInfo) { 133 | err := withRetries(func() error { 134 | database, err := getDatabase() 135 | if err != nil { 136 | fmt.Println("Failed to get sqlite3 database", err) 137 | return err 138 | } 139 | defer database.Close() 140 | uuid, err := uuid.NewRandom() 141 | if err != nil { 142 | return err 143 | } 144 | statement, err := database.Prepare("INSERT INTO download (ID, key, bucket, part) VALUES (?, ?, ?, ?)") 145 | if err != nil { 146 | fmt.Println("Failed to prepare recordDownloadPart statement: ", err) 147 | return err 148 | } 149 | _, execErr := statement.Exec(uuid, partInfo.FromKey, partInfo.FromBucket, partInfo.PartNumber) 150 | if execErr != nil { 151 | fmt.Println("Failed to execute recordDownloadPart statement: ", execErr, ". 
Retrying...") 152 | } 153 | return execErr 154 | }) 155 | if err != nil { 156 | fmt.Println("Failed to execute recordDownloadPart statement after retries: ", err) 157 | return 158 | } 159 | } 160 | 161 | func getDownloadedParts(fileInfo FileInfo) ([]int, error) { 162 | var partnumberList []int 163 | err := withRetries(func() error { 164 | database, err := getDatabase() 165 | if err != nil { 166 | fmt.Println("Failed to get sqlite3 database", err) 167 | return err 168 | } 169 | defer database.Close() 170 | partnumberList = []int{} 171 | rows, err := database.Query("SELECT part FROM download WHERE key = ? AND bucket = ? ORDER BY part ASC", fileInfo.FromKey, fileInfo.FromBucket) 172 | if err != nil { 173 | fmt.Println("Failed to prepare getDownloadedParts statement: ", err) 174 | return err 175 | } 176 | defer rows.Close() 177 | var part int 178 | for rows.Next() { 179 | err := rows.Scan(&part) 180 | if err != nil { 181 | fmt.Println("Failed to scan row: ", err) 182 | return err 183 | } 184 | partnumberList = append(partnumberList, part) 185 | } 186 | if err = rows.Err(); err != nil { 187 | fmt.Println("Rows iteration error: ", err) 188 | return err 189 | } 190 | return nil 191 | }) 192 | if err != nil { 193 | return nil, err 194 | } 195 | return partnumberList, nil 196 | } 197 | 198 | func deleteDownloadParts(fileInfo FileInfo) error { 199 | err := withRetries(func() error { 200 | database, err := getDatabase() 201 | if err != nil { 202 | fmt.Println("Failed to get sqlite3 database: ", err) 203 | return err 204 | } 205 | defer database.Close() 206 | statement, err := database.Prepare("DELETE FROM download WHERE key = ? AND bucket = ?") 207 | if err != nil { 208 | fmt.Println("Failed to prepare deleteDownloadParts statement: ", err) 209 | return err 210 | } 211 | _, err = statement.Exec(fileInfo.FromKey, fileInfo.FromBucket) 212 | if err != nil { 213 | fmt.Println("Failed to execute deleteDownloadParts statement: ", err) 214 | return err 215 | } 216 | return nil 217 | }) 218 | return err 219 | } 220 | --------------------------------------------------------------------------------