├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.en.md ├── README.md ├── check.go ├── download.go ├── go.mod ├── go.sum ├── http_download.go ├── img ├── arch-cn1.png ├── arch-cn2.png ├── arch-en1.png └── arch-en2.png ├── main.go ├── s3tos3.go ├── sqs2trans.go ├── upload.go └── util.go /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .* 3 | __* 4 | 5 | *.yaml 6 | *.txt 7 | *.db 8 | *.log 9 | s3trans 10 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | 61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 62 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | 16 | -------------------------------------------------------------------------------- /README.en.md: -------------------------------------------------------------------------------- 1 | 2 | # Amazon S3 Resumable Transfer V2 (Amazon S3 断点续传传输 V2) 3 | 4 | 中文 README: [README.md](README.md) 5 | 6 | Multi-threaded breakpoint resumption, suitable for batch uploading/downloading large files to/from S3 and local/cross-object storage migration. It supports Amazon S3, Alibaba Cloud OSS, Tencent Cloud COS, Google Cloud Storage, Huawei Cloud OBS, and other object storage services compatible with the S3 API. In Version 2, the same application can be configured for various scenarios through configuration: single-machine uploading, single-machine downloading, deployed as a cluster version for scanning source files, or as a distributed transmission worker node in a cluster. It has been refactored with Go for improved performance and supports a range of extended features: exclusion list, source no-sign-request, source request-payer, destination storage-class, destination ACL, and metadata transfer. 7 | 8 | ![img](./img/arch-en1.png) 9 | ![img](./img/arch-en2.png) 10 | 11 | ## Features 12 | 13 | * Supports multi-threaded concurrent transfers to multiple object storage systems, with resumable transfers, automatic retries, and concurrent multi-file tasks to fully utilize bandwidth. An optimized flow control mechanism is implemented. In a cluster test (10*m5.large instances), 1.2TB of data was migrated from us-east-1 to cn-northwest-1 in just 1 hour. In another single-machine bandwidth test, using an m6i.8xlarge EC2 instance (num-workers 16), a sustained transfer speed of 12Gbps was achieved between two S3 buckets in the same region. 14 | 15 | * Supports sources and destinations including local directories or files, Amazon S3, Alibaba OSS, Tencent COS, Google GCS, and other object storage systems. No need to distinguish between work modes; simply specify the source and destination URLs or local paths, and the transfer will automatically start. Can handle single files or objects, entire directories, S3 buckets/prefixes, etc. 16 | 17 | * Data is transferred through intermediate nodes in single chunk form without being written to disk, saving time and improving security. Supports transfers from 0 bytes up to TB-level sizes. 18 | 19 | * Allows setting various object storage classes for the destination, such as STANDARD|REDUCED_REDUNDANCY|STANDARD_IA|ONEZONE_IA|INTELLIGENT_TIERING|GLACIER|DEEP_ARCHIVE|OUTPOSTS|GLACIER_IR|SNOW 20 | 21 | * Supports specifying the destination S3 ACL: private|public-read|public-read-write|authenticated-read|aws-exec-read|bucket-owner-read|bucket-owner-full-control 22 | 23 | * Supports setting the source object storage as no-sign-request and request-payer. 24 | 25 | * Supports source objects as Presigned URLs or URL lists (please ensure the correct Region is specified when generating Presigned URLs). 26 | 27 | * Supports retrieving and copying metadata from the source object storage to the destination. 28 | 29 | * Automatically compares file names and sizes between source and destination buckets, transferring only mismatched files. By default, it lists and transfers simultaneously, fetching destination object information and transferring one by one, providing an immediate transfer start after entering the command (similar to AWS CLI). Optionally, the -l parameter can be used to list all destination objects before transferring, which is more efficient and reduces request costs. 30 | 31 | * Version 2 now supports multi-threaded parallel listing, significantly speeding up the listing process for buckets with large numbers of objects. For example, a bucket with 30 million objects that would normally take over 90 minutes to list (e.g., with aws s3 ls) can now be listed in just 1-2 minutes using 64 concurrent threads (16 vCPU). 32 | 33 | * Supports saving the compared task list to a file, saving logs of tasks sent to SQS to a file, setting an exclusion list to skip transferring keys or local paths matching the list, a DRYRUN mode to compare sources and destinations without transferring data, and a mode to overwrite destinations without comparison. 34 | 35 | * Supports setting a resumable transfer threshold, parallel thread count, request timeout, maximum retry count, and an option to ignore confirmation prompts and execute directly. 36 | 37 | ## Usage 38 | 39 | ### Install Go Runtime 40 | 41 | For first time use, install Golang runtime, example for Linux: 42 | 43 | ```shell 44 | sudo yum install go git -y 45 | git clone https://github.com/aws-samples/amazon-s3-resumable-upload 46 | ``` 47 | 48 | For China regions, use go proxy to speed up downloading go packages, add: 49 | 50 | ```go 51 | go env -w GOPROXY=https://goproxy.cn,direct 52 | ``` 53 | 54 | ### Compile Go Code 55 | 56 | ```shell 57 | cd amazon-s3-resumable-upload 58 | go build . # downloads dependencies and compiles 59 | ``` 60 | 61 | Use ./s3trans -h to see help 62 | 63 | ### Quick Start 64 | 65 | * Download S3 file to local: 66 | 67 | ```shell 68 | ./s3trans s3://bucket-name/prefix /local/path 69 | # Above uses default AWS profile in ~/.aws/credentials, or IAM Role if on EC2. To specify profile for source S3: 70 | ./s3trans s3://bucket-name/prefix /local/path --from_profile=source_profile 71 | ``` 72 | 73 | * Upload local file to S3: 74 | 75 | ```shell 76 | ./s3trans /local/path s3://bucket-name/prefix 77 | # Above uses default AWS profile in ~/.aws/credentials, or IAM Role if on EC2. To specify profile for destination S3: 78 | ./s3trans /local/path s3://bucket-name/prefix --to_profile=dest_profile 79 | ``` 80 | 81 | * S3 to S3, region is auto detected if not specified: 82 | 83 | ```shell 84 | ./s3trans s3://bucket-name/prefix s3://bucket-name/prefix --to_profile=dest_profile 85 | # Above from_profile not set uses default or EC2 IAM Role. Can also specify both: 86 | ./s3trans s3://bucket-name/prefix s3://bucket-name/prefix --from_profile=source_profile --to_profile=dest_profile 87 | ``` 88 | 89 | * For non-AWS S3 compatible storage, specify endpoint: 90 | 91 | ```shell 92 | ./s3trans s3://bucket-gcs-test s3://bucket-virginia --from_profile=gcs_profile --to_profile=aws_profile --from_endpoint=https://storage.googleapis.com 93 | # Can use short names for endpoints, e.g. --from_endpoint=google_gcs, also supports: ali_oss, tencent_cos, azure_blob(TODO: azure) 94 | ``` 95 | 96 | * -l to list target before transfer (less API calls but slower start) 97 | * -n (n is NumWorkers) to specify concurrency for listing and transfers. Max concurrent objects is n, max concurrent parts per object is 4n, max concurrent listing is 4n. Recommend n <= vCPU number 98 | * -y to auto confirm prompt 99 | 100 | ```shell 101 | ./s3trans C:\Users\Administrator\Downloads\test\ s3://huangzb-virginia/win2/ --to-profile sin -l -n 8 -y 102 | ``` 103 | 104 | ## Download from Presign URL or URL list 105 | 106 | Download a single presigned URL (presigned URL) concurrently: 107 | 108 | ```shell 109 | ./s3trans "https://your_bucket.s3.region.amazonaws.com/prefix/filename?X-Amz-Algorithm=xxxx&&X-Amz-Credential=xxxx&&X-Amz-Date=xxxx&&X-Amz-Expires=xxxx&X-Amz-SignedHeaders=host&X-Amz-Signature=xxxx" /localpath_download_to/ 110 | ``` 111 | 112 | Download from a list of URLs. In the example below, the URL list file is named list_file.txt, with each line containing a presigned URL: 113 | 114 | ```shell 115 | ./s3trans /mypath/list_file.txt /localpath_download_to/ 116 | --work-mode HTTP_DOWNLOAD_LIST 117 | ``` 118 | 119 | ## More usage help 120 | 121 | ```shell 122 | ./s3trans -h 123 | 124 | s3trans transfers data from source to target. 125 | ./s3trans FROM_URL TO_URL [OPTIONS] 126 | FROM_URL: The url of data source, e.g. /home/user/data or s3://bucket/prefix 127 | TO_URL: The url of data transfer target, e.g. /home/user/data or s3://bucket/prefix 128 | For example: 129 | ./s3trans s3://bucket/prefix s3://bucket/prefix -from_profile sin -to_profile bjs 130 | ./s3trans s3://bucket/prefix /home/user/data -from_profile sin 131 | 132 | Usage: 133 | s3trans FROM_URL TO_URL [flags] 134 | 135 | Flags: 136 | --acl string The TARGET S3 bucket ACL, private means only the object owner can read&write, e.g. private|public-read|public-read-write|authenticated-read|aws-exec-read|bucket-owner-read|bucket-owner-full-control 137 | --from-endpoint string The endpoint of data source, e.g. https://storage.googleapis.com; https://oss-.aliyuncs.com; https://cos..myqcloud.com . If AWS s3 or local path, no need to specify this. 138 | --from-profile string The AWS profile in ~/.aws/credentials of data source 139 | --force-path-style Set this to true to force the request to use path-style addressing See http://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html 140 | --from-region string The region of data transfer source, e.g. cn-north-1. If no specified, the region will be auto detected with the credentials you provided in profile. 141 | -h, --help help for s3trans 142 | --http-timeout int API request timeout (seconds) (default 30) 143 | -l, --list-target List the TARGET S3 bucket, compare exist objects BEFORE transfer. List is more efficient than head each object to check if it exists, but transfer may start slower because it needs to wait for listing all objects to compare. To mitigate this, this app leverage Concurrency Listing for fast list; If no list-target para, transfer without listing the target S3 bucket, but before transfering each object, head each target object to check, this costs more API call, but start faster. 144 | --max-retries int API request max retries (default 5) 145 | --no-sign-request The SOURCE bucket is not needed to sign the request 146 | -n, --num-workers int NumWorkers*1 for concurrency files; NumWorkers*4 for parts of each file and for listing target bucket; Recommend NumWorkers <= vCPU number (default 4) 147 | --request-payer The SOURCE bucket requires requester to pay, set this 148 | --resumable-threshold int When the file size (MB) is larger than this value, the file will be resumable transfered. (default 50) 149 | -s, --skip-compare If True, skip to compare the name and size between source and target S3 object. Just overwrite all objects. No list target nor head target object to check if it already exists. 150 | --sqs-profile string The SQS queue leverage which AWS profile in ~/.aws/credentials 151 | --sqs-url string The SQS queue URL to send or consume message from, e.g. https://sqs.us-east-1.amazonaws.com/my_account/my_queue_name 152 | --storage-class string The TARGET S3 bucket storage class, e.g. STANDARD|REDUCED_REDUNDANCY|STANDARD_IA|ONEZONE_IA|INTELLIGENT_TIERING|GLACIER|DEEP_ARCHIVE|OUTPOSTS|GLACIER_IR|SNOW or others of S3 compatibale 153 | --to-endpoint string The endpoint of data transfer target, e.g. https://storage.googleapis.com . If AWS s3 or local path, no need to specify this. 154 | --to-profile string The AWS profile in ~/.aws/credentials of data transfer target 155 | --to-region string The region of data transfer target, e.g. us-east-1. If no specified, the region will be auto detected with the credentials you provided in profile. 156 | --transfer-metadata If True, get metadata from source S3 bucket and upload the metadata to target object. This costs more API calls. 157 | --work-mode string SQS_SEND | SQS_CONSUME | DRYRUN | HTTP_DOWNLOAD_LIST; SQS_SEND means listing source FROM_URL S3 and target TO_URL S3 to compare and send message to SQS queue, SQS_CONSUME means consume message from SQS queue and transfer objects from FROM_URL S3 to TO_URL S3; DRYRUN means only count the objects and sizes comparing delta list of FROM_URL S3 and TO_URL S3, no transfer; HTTP_DOWNLOAD_LIST, from a list file with lines of presign url 158 | -y, --y Ignore waiting for confirming command 159 | ``` 160 | 161 | ## License 162 | 163 | This library is licensed under the MIT-0 License. See the LICENSE file. 164 | 165 | ****** 166 | Author: Huang, Zhuobin (James) 167 | ****** 168 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Amazon S3 Resumable Transfer V2 (Amazon S3 断点续传传输 V2) 2 | 3 | English README: [README.en.md](README.en.md) 4 | 5 | 多线程断点续传,适合批量的大文件S3上传/下载本地/跨对象存储迁移,支持Amazon S3, Ali OSS, Tencent COS, Google GCS, HuaweiCloud 等兼容S3 API的对象存储 6 | 本 Version 2 在同一个应用通过配置即可用做各种场景:单机的上传,单机的下载,部署为集群版的扫描源文件,或作为集群版的分布式传输工作节点;用Golang做了重构,提高性能;支持了一系列扩展功能:排除列表、源no-sign-request、源request-payer、目的storage-class、目的ACL、传输 Metadata 等。 7 | 8 | ![img](./img/arch-cn1.png) 9 | ![img](./img/arch-cn2.png) 10 | 11 | ## 功能 12 | 13 | * 多线程并发传输到多种对象存储,断点续传,自动重传。多文件任务并发,充分利用带宽。优化的流控机制。在一次集群测试中(10台m5.large),迁移1.2TB数据从 us-east-1 到 cn-northwest-1 只用1小时。在另一个单机带宽的测试中,同 Region 的两个 S3 用 m6i.8xlarge EC2 传输 (num-workers 16),跑出了单机持续 12Gbps 的传输速度。 14 | 15 | * 支持的源和目的地:本地目录或单个文件, Amazon S3, Ali OSS, Tencent COS, Google GCS 等对象存储。无需区分工作模式,指定好源和目的URL或本地路径即可自动识别并开始传输。可以是单个文件或对象,或整个目录,或S3桶/前缀等URL。 16 | 17 | * 传输数据只以单个分片的形式过中转节点的内存,不落盘到节点,节省时间且更安全。可支撑 0 Size 至 TB 级别 。 18 | 19 | * 支持设置目的地的各种对象存储级别,如:STANDARD|REDUCED_REDUNDANCY|STANDARD_IA|ONEZONE_IA|INTELLIGENT_TIERING|GLACIER|DEEP_ARCHIVE|OUTPOSTS|GLACIER_IR|SNOW 20 | 21 | * 支持指定目的S3的ACL: private|public-read|public-read-write|authenticated-read|aws-exec-read|bucket-owner-read|bucket-owner-full-control 22 | 23 | * 支持设置源对象存储是no-sign-request和request-payer 24 | 25 | * 支持源对象是 Presign URL 或 URL 列表(注意生成Presign URL的时候指定 Bucket 正确的 Region) 26 | 27 | * 支持获取源对象存储的 Metadata 也复制到目的对象存储。但要注意这个需要每个对象都Head去获取一次,会影响性能和增加对源S3的请求次数费用。 28 | 29 | * 自动对比源/目的桶的文件名和大小,不一致的才传输。默认是一边List,一边传输,即逐个获取目的对象信息对比一个就传输一个,这样使用体验是输入命令之后就立马启动传输(类似AWS CLI);可选设置 -l 参数,为List目的对象列表之后再进行传输,因为List比逐个Head对比效率更高,也节省请求次数的费用。 30 | 31 | * 本次 Version 2 支持了多线程并行 List ,对于对象数量很多的情况,可以更快完成List。例如3千万对象的桶,如果按正常 List(例如 aws s3 ls)要90分钟以上,而现在在使用64并发的情况(16vCPU)下缩减到只有 1 到 2 分钟。 32 | 33 | * 支持把对比扫描出来的任务列表存入文件;支持把已发送到SQS的日志存入文件;支持设置排除列表,如果数据源Key或源本地路径符合排除列表的则不传输;支持DRYRUN模式,只比较源和目的桶,统计数量和Size,不传输数据;支持不做对比不检查目的对象,直接覆盖的模式。 34 | 35 | * 支持设置断点续传阈值;设置并行线程数;设置请求超时时间;设置最大重试次数;支持设置是否忽略确认命令,直接执行; 36 | 37 | ## 使用说明 38 | 39 | ### 安装Go运行环境 40 | 41 | 首次使用需要安装Golang运行环境,以Linux为例: 42 | 43 | ```shell 44 | sudo yum install go git -y 45 | ``` 46 | 47 | 如果在中国区,可通过go代理来下载go依赖包,则多运行一句代理设置: 48 | 49 | ```go 50 | go env -w GOPROXY=https://goproxy.cn,direct 51 | ``` 52 | 53 | ### 下载和编译本项目的Go代码 54 | 55 | ```shell 56 | git clone https://github.com/aws-samples/amazon-s3-resumable-upload 57 | cd amazon-s3-resumable-upload 58 | go build . # 下载依赖包并编译程序 59 | ``` 60 | 61 | 可使用 ./s3trans -h 获取帮助信息 62 | 63 | ### 使用 64 | 65 | * 下载S3文件到本地: 66 | 67 | ```shell 68 | ./s3trans s3://bucket-name/prefix /local/path 69 | # 以上是使用默认AWS profile in ~/.aws/credentials,如果是EC2且没有配置 profile 而是使用IAM Role,需指定一下 Region 70 | ./s3trans s3://bucket-name/prefix /local/path --from-region=my_region 71 | # 如果要指定S3的profile则如下: 72 | ./s3trans s3://bucket-name/prefix /local/path --from-profile=source_profile 73 | ``` 74 | 75 | * 上传本地文件到S3: 76 | 77 | ```shell 78 | ./s3trans /local/path s3://bucket-name/prefix 79 | # 以上是使用默认AWS profile in ~/.aws/credentials,如果是EC2且没有配置 profile 而是使用IAM Role,需指定一下 Region 80 | ./s3trans /local/path s3://bucket-name/prefix --to-region=my_region 81 | # 如果要指定S3的profile则如下: 82 | ./s3trans /local/path s3://bucket-name/prefix --to-profile=dest_profile 83 | ``` 84 | 85 | * 从S3到S3,如不指定region,则程序会先自动查询Bucket的Region: 86 | 87 | ```shell 88 | ./s3trans s3://bucket-name/prefix s3://bucket-name/prefix --from-profile=source_profile --to-profile=dest_profile 89 | # 如果from-profile不填则获取默认的profile或使用EC2 IAM Role,需指定一下region 90 | ./s3trans s3://bucket-name/prefix s3://bucket-name/prefix --from-region=my_region --to-profile=dest_profile 91 | ``` 92 | 93 | * 对于非AWS的S3兼容存储,则需要指定endpoint 94 | 95 | ```shell 96 | ./s3trans s3://bucket-gcs-test s3://bucket-virginia --from-profile=gcs_profile --to-profile=aws_profile --from-endpoint=https://storage.googleapis.com 97 | # 以上endpoint也可以用简称替换,即:--from-endpoint=google_gcs,还可以是其他简称:ali_oss, tencent_cos, azure_blob(TODO: azure) 98 | ``` 99 | 100 | * -l 指定先List再同步数据(节省请求次数费用,但会增加一次List的时间) 101 | * -n (n 即NumWorkers)指定并行List和并行传输线程数。最大并发对象数为n,每个对象最大并发为2n,List Bucket时最大并发为4n;推荐n <= vCPU numbe 102 | * -y 忽略确认命令,直接执行 103 | 104 | ```shell 105 | ./s3trans C:\Users\Administrator\Downloads\test\ s3://huangzb-virginia/win2/ --to-profile sin -l -n 8 -y 106 | ``` 107 | 108 | * 支持设置排除列表 (--ignore-list-path) 如果数据源的S3 Key或源本地路径符合排除列表的则不传输 109 | 例如,排除列表路径设置为 --ignore-list-path="./ignore-list.txt" 文件内容为: 110 | 111 | ```text 112 | test2/ 113 | test1 114 | ``` 115 | 116 | 则源数据中遇到这些路径都会被跳过,不传输:test2/abc.zip, test1/abc.zip, test1, test1.zip, test2/cde/efg等... 117 | 而这些路径则会正常传输,因为开头Prefix不一致:test3/test1, test3/test2/ 等... 118 | 119 | ## 集群模式 120 | 121 | ### 集群模式的 List 模块使用 122 | 123 | 对比源Bucket/Prefix和目的Bucket/Prefix,把不一致的对象信息写入SQS队列,以便后续的传输节点使用。 124 | 需要指定源S3和目的S3的URL,另还需要指定一个SQS用于发送任务列表,包括SQS的url和能访问这个SQS所用的AWS profile。不指定 sqs profile 则程序会自动从 EC2 IAM Role 获取权限,Region名称会从sqs-url自动提取。 125 | 可选: 126 | 设置把对比扫描出来的任务列表存入文件 --joblist-write-to-filepath; 127 | 设置把SQS发送的日志存入文件 --sqs-log-to-filename 128 | 129 | ```shell 130 | ./s3trans s3://from_bucket/ s3://to_bucket/prefix --from-profile us --to-profile bjs \ 131 | --work-mode SQS_SEND 132 | --sqs-profile us \ 133 | --sqs-url "https://sqs.region.amazonaws.com/my_account_number/sq_queue_sname" \ 134 | --joblist-write-to-filepath "./my_joblist.log" \ 135 | --sqs-log-to-filename "./sqssent.log" \ 136 | -y -l -n 8 137 | ``` 138 | 139 | ### 集群模式的传输节点使用 140 | 141 | 从SQS队列中获取任务列表,然后传输数据。需要指定源S3和目的S3的URL,另还需要指定一个SQS用于发送任务列表,包括SQS的url和能访问这个SQS所用的AWS profile。不指定 sqs profile 则程序会自动从 EC2 IAM Role 获取权限,Region名称会从sqs-url自动提取。 142 | 143 | ```shell 144 | ./s3trans s3://from_bucket/prefix s3://to_bucket/ --from-profile us --to-profile bjs \ 145 | --work-mode SQS_CONSUME 146 | --sqs-profile us \ 147 | --sqs-url "https://sqs.region.amazonaws.com/my_account_number/sq_queue_sname" \ 148 | -y -l -n 8 149 | ``` 150 | 151 | ## 下载从 Presign URL 或 URL 列表 152 | 153 | 多线程并发下载单一个 URL (presign url) 154 | 155 | ```shell 156 | ./s3trans "https://your_bucket.s3.region.amazonaws.com/prefix/filename?X-Amz-Algorithm=xxxx&&X-Amz-Credential=xxxx&&X-Amz-Date=xxxx&&X-Amz-Expires=xxxx&X-Amz-SignedHeaders=host&X-Amz-Signature=xxxx" /localpath_download_to/ 157 | ``` 158 | 159 | 多线程并发按照 URL 列表下载,下例子中 URL 列表文件名为 list_file.txt,文件中每行为一个 presign URL 160 | 161 | ```shell 162 | ./s3trans /mypath/list_file.txt /localpath_download_to/ 163 | --work-mode HTTP_DOWNLOAD_LIST 164 | ``` 165 | 166 | ## 其他使用帮助 167 | 168 | ./s3trans -h 169 | 170 | s3trans 从源传输数据到目标 171 | ./s3trans FROM_URL TO_URL [OPTIONS] 172 | FROM_URL: 数据源的URL,例如 /home/user/data or s3://bucket/prefix 173 | TO_URL: 传输目标的URL,例如 /home/user/data or s3://bucket/prefix 174 | 175 | Usage: 176 | s3trans FROM_URL TO_URL [flags] 177 | 178 | ```shell 179 | Flags: 180 | --acl string 目标S3桶的ACL,private表示只有对象所有者可以读写,例如 private|public-read|public-read-write|authenticated-read|aws-exec-read|bucket-owner-read|bucket-owner-full-control ,不设置则默认根据S3的默认设置,通常是 private 模式 181 | --from-endpoint string 数据源的 API Endpoint 例如 https://storage.googleapis.com; https://oss-shenzhen.aliyuncs.com; https://cos..myqcloud.com 如果是AWS S3或本地路径,无需指定这个 Endpoint 182 | --from-profile string 数据源在~/.aws/credentials中的AWS profile,如果不指定profile则用default profile,如果没有default profile,则需指定region 183 | --force-path-style 设置为true时,可强制请求使用路径样式寻址,而不是域名。参考:http://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html 184 | --from-region string 数据源的区域,例如 cn-north-1. 如果未指定,但有设置 profile 则会自动找S3的所在 Region 185 | -h, --help 帮助文档 186 | --http-timeout int API请求超时(秒)(默认30) 187 | -l, --list-target 推荐使用。列出目标S3桶,传输之前先比较现有对象。因为列表方式比逐个对象请求检查是否存在更有效率,只是因为需要等待列出所有对象进行比较,然后再开始传输所以感觉启动较慢。为了缓解这个问题,此应用程序利用多线程并行List,进行快速列表;如果没有设置--list-target参数,就不List目标S3桶了,而是在传输每个对象之前,检查每个目标对象,这会消耗更多API调用,但开始更快;如果完全不希望做对比,直接覆盖,则用下面提到的--skip-compare参数,而不用--list-target了; 188 | --max-retries int API请求最大重试次数(默认5) 189 | --no-sign-request 源桶不需要请求签名(即允许匿名)的情况 190 | -n, --num-workers int NumWorkers x 1 个并发线程传输文件;NumWorkers x 2 每个文件的并发分片同时传输的线程数;NumWorkers x 4 List目标桶的并发线程数;推荐NumWorkers <= vCPU数量(默认4) 191 | --request-payer 源桶要求请求者支付的情况 192 | --resumable-threshold int 当文件大小(MB)大于此值时,使用断点续传。(默认50) 193 | -s, --skip-compare 跳过比较源和目标S3对象的名称和大小。直接覆盖所有对象。不列出目标也不检查目标对象是否已存在。 194 | --sqs-profile string work-mode为SQS_SEND或SQS_CONSUME的场景下,为访问SQS队列使用~/.aws/credentials中的哪个AWS profile,不指定sqs profile则程序会自动从EC2 IAM Role获取权限,Region名称会从sqs-url自动提取。 195 | --sqs-url string work-mode为SQS_SEND或SQS_CONSUME的场景下,指定发送或消费消息的SQS队列URL,例如 https://sqs.us-east-1.amazonaws.com/my_account/my_queue_name 196 | --storage-class string 目标S3桶的存储类,例如 STANDARD|REDUCED_REDUNDANCY|STANDARD_IA|ONEZONE_IA|INTELLIGENT_TIERING|GLACIER|DEEP_ARCHIVE|OUTPOSTS|GLACIER_IR|SNOW 或其他S3兼容的 197 | --to-endpoint string 数据传输目标的端点,例如 https://storage.googleapis.com . 如果是AWS S3或本地路径,无需指定这个 198 | --to-profile string 数据传输目标在~/.aws/credentials中的AWS profile,如果不指定profile则用default profile,如果没有default profile,则需指定region 199 | --to-region string 数据传输目标的区域,例如 cn-north-1. 如果未指定,但有设置 profile 则会自动找S3的所在 Region 200 | --transfer-metadata 从源S3桶获取元数据并上传到目标对象。这需要每传输一个对象都通过API调用获取源文件元数据。 201 | --work-mode string SQS_SEND | SQS_CONSUME | DRYRUN | HTTP_DOWNLOAD_LIST; SQS_SEND:扫描节点,表示列出源S3和目标S3进行比较,并发送传输任务消息到SQS队列;SQS_CONSUME: 工作节点,表示从SQS队列获取任务消息并从来源S3传输对象到S3; HTTP_DOWNLOAD_LIST:从一个文件中的 presign url 列表(即一个http列表)下载; 202 | -y, --y 忽略等待确认,直接执行;DRYRUN是只比较源和目的桶,统计数量和Size,不传输数据 203 | ``` 204 | 205 | ## 其他说明 206 | 207 | ### S3 触发 SQS 的 Policy示例 208 | 209 | 写入SQS权限:"Service": "s3.amazonaws.com" 210 | 读取SQS权限:EC2 Role 或直接填 AWS Account Number 211 | 212 | ```json 213 | { 214 | "Version": "2008-10-17", 215 | "Id": "__default_policy_ID", 216 | "Statement": [ 217 | { 218 | "Sid": "__owner_statement", 219 | "Effect": "Allow", 220 | "Principal": { 221 | "AWS": "arn:aws:iam::my_account_number:root" 222 | }, 223 | "Action": "SQS:*", 224 | "Resource": "arn:aws:sqs:us-west-2:my_account_number:s3_migration_queque" 225 | }, 226 | { 227 | "Sid": "__sender_statement", 228 | "Effect": "Allow", 229 | "Principal": { 230 | "Service": "s3.amazonaws.com" 231 | }, 232 | "Action": "SQS:SendMessage", 233 | "Resource": "arn:aws:sqs:us-west-2:my_account_number:s3_migration_queque" 234 | }, 235 | { 236 | "Sid": "__receiver_statement", 237 | "Effect": "Allow", 238 | "Principal": { 239 | "AWS": "arn:aws:iam::my_account_number:root" 240 | }, 241 | "Action": [ 242 | "SQS:ChangeMessageVisibility", 243 | "SQS:DeleteMessage", 244 | "SQS:ReceiveMessage" 245 | ], 246 | "Resource": "arn:aws:sqs:us-west-2:my_account_number:s3_migration_queque" 247 | } 248 | ] 249 | } 250 | 251 | ``` 252 | 253 | ### 配置文件 254 | 255 | 如果不使用上面的命令行参数,而使用配置文件,可以在程序运行目录下写一个config.yaml文件,内容如下。然后只需要运行 ./s3trans FROM_URL TO_URL 即可。 256 | 257 | ```yaml 258 | from-profile: "your_from_profile" 259 | to-profile: "your_to_profile" 260 | from-endpoint: "your_from_endpoint" 261 | to-endpoint: "your_to_endpoint" 262 | from-region: "your_from_region" 263 | to-region: "your_to_region" 264 | storage-class: "your_storage_class" 265 | acl: "your_acl" 266 | no-sign-request: false 267 | request-payer: false 268 | db-location: "./your_download_status.db" 269 | list-target: false 270 | skip-compare: false 271 | transfer-metadata: false 272 | http-timeout: 30 273 | max-retries: 5 274 | retry-delay: 5 275 | chunk-size: 5 276 | resumable-threshold: 50 277 | num-workers: 4 278 | y: false 279 | work-mode: "your_work_mode" 280 | sqs-url: "your_sqs_url" 281 | sqs-profile: "your_sqs_profile" 282 | joblist-write-to-filepath: "your_joblist_write_to_filepath" 283 | sqs-log-to-filename: "your_sqs_log_to_filename" 284 | ignore-list-path: "your_ignore_list_path" 285 | ``` 286 | 287 | 还可以把以上配置写入环境变量。 288 | 289 | ## License 290 | 291 | This library is licensed under the MIT-0 License. See the LICENSE file. 292 | 293 | ****** 294 | Author: Huang, Zhuobin (James) 295 | ****** 296 | -------------------------------------------------------------------------------- /check.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log" 7 | "path" 8 | "sync" 9 | 10 | "github.com/aws/aws-sdk-go/aws" 11 | "github.com/aws/aws-sdk-go/aws/awserr" 12 | "github.com/aws/aws-sdk-go/service/s3" 13 | "golang.org/x/sync/semaphore" 14 | ) 15 | 16 | func checkPartnumberList(svc *s3.S3, bucket, key, uploadId string) ([]PartInfo, error) { 17 | var partNumbers []PartInfo 18 | var partNumbersPrint []int64 19 | err := svc.ListPartsPages(&s3.ListPartsInput{ 20 | Bucket: aws.String(bucket), 21 | Key: aws.String(key), 22 | UploadId: aws.String(uploadId), 23 | }, func(page *s3.ListPartsOutput, lastPage bool) bool { 24 | for _, part := range page.Parts { 25 | partNumbers = append(partNumbers, PartInfo{ 26 | PartNumber: *part.PartNumber, 27 | Etag: *part.ETag, 28 | }) 29 | partNumbersPrint = append(partNumbersPrint, *part.PartNumber) 30 | } 31 | return !lastPage 32 | }) 33 | log.Printf(" Exist %d parts on s3://%s :%v\n", len(partNumbersPrint), path.Join(bucket, key), partNumbersPrint) 34 | return partNumbers, err 35 | } 36 | 37 | func checkFileExistHead(svc *s3.S3, tofileInfo FileInfo, multipartUploadsList []*s3.MultipartUpload) (string, error) { 38 | exist, err := headFile(svc, tofileInfo) 39 | if err != nil { 40 | return "", err 41 | } 42 | if exist { 43 | return "NEXT", nil 44 | } 45 | 46 | // 找不到文件,或文件不一致,且要重新传的,查是否有MultipartUpload ID 47 | uploadId, err := checkMultipartUploadId(tofileInfo, multipartUploadsList) 48 | 49 | return uploadId, err 50 | } 51 | 52 | func compareMetaStructs(meta1, meta2 MetaStruct) bool { 53 | if aws.StringValue(meta1.ContentType) != aws.StringValue(meta2.ContentType) || 54 | aws.StringValue(meta1.ContentLanguage) != aws.StringValue(meta2.ContentLanguage) || 55 | aws.StringValue(meta1.ContentEncoding) != aws.StringValue(meta2.ContentEncoding) || 56 | aws.StringValue(meta1.CacheControl) != aws.StringValue(meta2.CacheControl) || 57 | aws.StringValue(meta1.ContentDisposition) != aws.StringValue(meta2.ContentDisposition) { 58 | return false 59 | } 60 | if len(meta1.Metadata) != len(meta2.Metadata) { 61 | return false 62 | } 63 | for k, v := range meta1.Metadata { 64 | if v2, ok := meta2.Metadata[k]; !ok || aws.StringValue(v) != aws.StringValue(v2) { 65 | return false 66 | } 67 | } 68 | return true 69 | } 70 | 71 | func headFile(svc *s3.S3, tofileInfo FileInfo) (bool, error) { 72 | log.Printf(" Call HEAD to compare target s3://%s\n", path.Join(tofileInfo.ToBucket, tofileInfo.ToKey)) 73 | input := &s3.HeadObjectInput{ 74 | Bucket: aws.String(tofileInfo.ToBucket), 75 | Key: aws.String(tofileInfo.ToKey), 76 | } 77 | result, err := svc.HeadObject(input) 78 | // If Not Exist 79 | if err != nil { 80 | if aerr, ok := err.(awserr.RequestFailure); ok { 81 | if aerr.StatusCode() == 404 { 82 | return false, nil 83 | } 84 | } 85 | return false, err 86 | } 87 | // If Exist check size 88 | if *result.ContentLength == tofileInfo.Size { 89 | // If Exist and need to check metadata 90 | if cfg.TransferMetadata { 91 | log.Printf(" Comparing metadata of target s3://%s\n", path.Join(tofileInfo.ToBucket, tofileInfo.ToKey)) 92 | resultStruct := MetaStruct{ 93 | Metadata: result.Metadata, 94 | ContentType: result.ContentType, 95 | ContentLanguage: result.ContentLanguage, 96 | ContentEncoding: result.ContentEncoding, 97 | CacheControl: result.CacheControl, 98 | ContentDisposition: result.ContentDisposition, 99 | } 100 | 101 | if !compareMetaStructs(resultStruct, tofileInfo.Others) { 102 | log.Printf("...Metadata not match, upload target s3://%s\n", path.Join(tofileInfo.ToBucket, tofileInfo.ToKey)) 103 | return false, nil 104 | } 105 | 106 | } 107 | return true, nil 108 | } 109 | return false, nil 110 | } 111 | 112 | func checkFileExistList(tofileInfo FileInfo, targetObjectList []*s3.Object, multipartUploadsList []*s3.MultipartUpload) (string, error) { 113 | for _, f := range targetObjectList { 114 | if *f.Key == tofileInfo.ToKey && *f.Size == tofileInfo.Size { 115 | return "NEXT", nil // 文件完全相同 116 | } 117 | } 118 | 119 | // 找不到文件,或文件不一致,且要重新传的,查是否有MultipartUpload ID 120 | uploadId, err := checkMultipartUploadId(tofileInfo, multipartUploadsList) 121 | return uploadId, err 122 | } 123 | 124 | func checkMultipartUploadId(tofileInfo FileInfo, multipartUploadsList []*s3.MultipartUpload) (string, error) { 125 | if tofileInfo.Size < cfg.ResumableThreshold { 126 | return "", nil // 文件小于ResumableThreshold,不需要分片 127 | } 128 | // 查所有相同Key的ID给keyIDList 129 | var keyIDList []*s3.MultipartUpload 130 | for _, u := range multipartUploadsList { 131 | if *u.Key == tofileInfo.ToKey { 132 | keyIDList = append(keyIDList, u) 133 | } 134 | } 135 | 136 | // 如果找不到上传过的MultipartUpload,则从头开始传 137 | if len(keyIDList) == 0 { 138 | return "", nil 139 | } 140 | 141 | // 对同一个Key的不同MultipartUpload ID排序找出时间最晚的值 142 | var latestUpload *s3.MultipartUpload 143 | for _, u := range keyIDList { 144 | if latestUpload == nil || u.Initiated.After(*latestUpload.Initiated) { 145 | latestUpload = u 146 | } 147 | } 148 | 149 | return *latestUpload.UploadId, nil 150 | } 151 | 152 | func getUploadId(svc *s3.S3, fileInfo FileInfo, multipartUploadsList []*s3.MultipartUpload, targetObjectList []*s3.Object) (string, error) { 153 | var uploadId string 154 | var err error 155 | if !cfg.SkipCompare { // 设置不做Compare了就不对比目的对象,直接覆盖 156 | if cfg.TransferMetadata || !cfg.ListTarget { // 要传metadata就必须用Head方式去获取对比;不ListTarget也是逐个Head去对比 157 | uploadId, err = checkFileExistHead(svc, fileInfo, multipartUploadsList) 158 | if err != nil { 159 | log.Printf("failed to checkFileExistHead, %v", err) 160 | return "", err 161 | } 162 | } else if cfg.ListTarget && !cfg.TransferMetadata { // 不要metadata就用list方式去获取对比(如果设置了ListTraget True) 163 | uploadId, err = checkFileExistList(fileInfo, targetObjectList, multipartUploadsList) 164 | if err != nil { 165 | log.Printf("failed to checkFileExistList, %v", err) 166 | return "", err 167 | } 168 | } 169 | } 170 | return uploadId, nil 171 | } 172 | 173 | func getMultipartUploadList(svc *s3.S3, bucket string, prefix string) ([]*s3.MultipartUpload, error) { 174 | // log.Printf("Listing multipart uploads ID in target s3://%s\n", path.Join(bucket, prefix)) 175 | var uploads []*s3.MultipartUpload 176 | err := svc.ListMultipartUploadsPages(&s3.ListMultipartUploadsInput{ 177 | Bucket: aws.String(bucket), 178 | Prefix: aws.String(prefix), 179 | }, func(page *s3.ListMultipartUploadsOutput, lastPage bool) bool { 180 | uploads = append(uploads, page.Uploads...) 181 | return true // return false to stop pagination 182 | }) 183 | 184 | if err != nil { 185 | return nil, err 186 | } 187 | log.Printf("There are %d multipart uploads ID already in target s3://%s\n", len(uploads), path.Join(bucket, prefix)) 188 | 189 | return uploads, nil 190 | } 191 | 192 | func getS3ObjectList(b BInfo) ([]*s3.Object, error) { 193 | log.Printf("Listing s3://%s\n", path.Join(b.bucket, b.prefix)) 194 | var s3Objects []*s3.Object 195 | var mu sync.Mutex 196 | var wg sync.WaitGroup 197 | var sem = semaphore.NewWeighted(int64(cfg.NumWorkers * 4)) 198 | 199 | concurrencyListObjects(b.svc, b.bucket, b.prefix, sem, &s3Objects, &mu, &wg) 200 | wg.Wait() 201 | log.Printf("There are %d objects already in s3://%s\n", len(s3Objects), path.Join(b.bucket, b.prefix)) 202 | 203 | return s3Objects, nil 204 | } 205 | 206 | func concurrencyListObjects(svc *s3.S3, bucket, prefix string, sem *semaphore.Weighted, 207 | s3Objects *[]*s3.Object, mu *sync.Mutex, wg *sync.WaitGroup) { 208 | params := &s3.ListObjectsV2Input{ 209 | Bucket: aws.String(bucket), 210 | Prefix: aws.String(prefix), 211 | Delimiter: aws.String("/"), 212 | } 213 | 214 | err := svc.ListObjectsV2Pages(params, 215 | func(page *s3.ListObjectsV2Output, lastPage bool) bool { 216 | mu.Lock() 217 | *s3Objects = append(*s3Objects, page.Contents...) 218 | mu.Unlock() 219 | 220 | for _, commonPrefix := range page.CommonPrefixes { 221 | wg.Add(1) 222 | go func(p string) { 223 | defer sem.Release(1) 224 | defer wg.Done() 225 | sem.Acquire(context.Background(), 1) // 要放go func里面,因为上级线程需要继续运行下去 226 | concurrencyListObjects(svc, bucket, p, sem, s3Objects, mu, wg) //每个Prefix递归并发新线程 227 | }(*commonPrefix.Prefix) 228 | } 229 | return !lastPage 230 | }) 231 | if err != nil { 232 | fmt.Printf("Error listing s3 objects: %v", err) 233 | } 234 | } 235 | 236 | func getMetadata(b BInfo, fileInfo *FileInfo) error { 237 | log.Printf("-->Get metadata s3://%s\n", path.Join(fileInfo.FromBucket, fileInfo.FromKey)) 238 | headResp, err := b.svc.HeadObject(&s3.HeadObjectInput{ 239 | Bucket: aws.String(fileInfo.FromBucket), 240 | Key: aws.String(fileInfo.FromKey), 241 | }) 242 | if err != nil { 243 | log.Printf("failed to get object metadata, %v", err) 244 | } 245 | 246 | fileInfo.Others = MetaStruct{ 247 | Metadata: headResp.Metadata, 248 | ContentType: headResp.ContentType, 249 | ContentLanguage: headResp.ContentLanguage, 250 | ContentEncoding: headResp.ContentEncoding, 251 | CacheControl: headResp.CacheControl, 252 | ContentDisposition: headResp.ContentDisposition, 253 | } 254 | return nil 255 | } 256 | -------------------------------------------------------------------------------- /download.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "io" 7 | "log" 8 | "os" 9 | "path" 10 | "path/filepath" 11 | "strings" 12 | "sync" 13 | "sync/atomic" 14 | 15 | "github.com/aws/aws-sdk-go/aws" 16 | "github.com/aws/aws-sdk-go/service/s3" 17 | "golang.org/x/sync/semaphore" 18 | ) 19 | 20 | func startDownload(from, to BInfo) error { 21 | var wg sync.WaitGroup 22 | var err error 23 | semFile := semaphore.NewWeighted(int64(cfg.NumWorkers)) // 并发量为NumWorkers的信号量 for file 24 | semPart := semaphore.NewWeighted(int64(cfg.NumWorkers * 4)) // 并发量为NumWorkers的信号量 for parts 25 | ignoreList := getIgnoreList() 26 | 27 | err = from.svc.ListObjectsV2Pages(&s3.ListObjectsV2Input{ 28 | Bucket: aws.String(from.bucket), 29 | Prefix: aws.String(from.prefix), 30 | }, func(page *s3.ListObjectsV2Output, lastPage bool) bool { 31 | for _, item := range page.Contents { 32 | // Skip if the object is a directory 33 | if strings.HasSuffix(*item.Key, "/") { 34 | log.Println("...Skiping directory", *item.Key) 35 | continue 36 | } 37 | // Skip if key in ignoreList 38 | if isIgnored(*item.Key, ignoreList) { 39 | log.Println("...Skiping ignored key in ignoreList", *item.Key) 40 | } 41 | 42 | var combinedKey string 43 | if *item.Key != from.prefix { 44 | // 只带上Prefix以内的目录结构 45 | combinedKey = strings.TrimPrefix(*item.Key, from.prefix) 46 | } else { 47 | // 单个文件的时候存在*item.Key == from.prefix的情况 48 | combinedKey = filepath.Base(*item.Key) 49 | } 50 | 51 | localPath := filepath.Join(to.url, combinedKey) 52 | // Check if file already exists and is the same size 53 | info, err := os.Stat(localPath) 54 | if !cfg.SkipCompare { 55 | if err == nil && info.Size() == *item.Size { 56 | log.Println("...File exists and same size, skipping", localPath) 57 | continue 58 | } else if err != nil && !os.IsNotExist(err) { 59 | log.Println("Failed to stat file", localPath, err) 60 | continue 61 | } 62 | } 63 | 64 | // Create necessary directories 65 | thisdir := filepath.Dir(localPath) 66 | if err := os.MkdirAll(thisdir, 0755); err != nil { 67 | log.Println("Failed to create directories:", localPath, err) 68 | continue 69 | } 70 | 71 | semFile.Acquire(context.Background(), 1) //从线程信号池中获取,没有线程可用了就阻塞等待 72 | atomic.AddInt32(&runningGoroutines, 1) //线程计数 73 | wg.Add(1) 74 | go func(item *s3.Object) { 75 | defer wg.Done() 76 | defer semFile.Release(1) 77 | defer atomic.AddInt32(&runningGoroutines, -1) 78 | 79 | // 小文件 80 | if *item.Size < cfg.ResumableThreshold { 81 | log.Println(" Start to download (=ResumableThreshold):", localPath, "runningGoroutines:", runningGoroutines) 106 | multipart_download_finished := false 107 | file, err := os.OpenFile(localPath+".s3tmp", os.O_CREATE|os.O_WRONLY, 0644) 108 | if err != nil { 109 | log.Println("Failed to create s3tmp file", localPath, err) 110 | return 111 | } 112 | defer func() { 113 | file.Close() // 确保在 file close之后再执行rename 114 | if multipart_download_finished { 115 | // 检查文件是否存在, 如果文件存在,重命名为 localPath 116 | if _, err := os.Stat(localPath + ".s3tmp"); err == nil { 117 | // 118 | if err := os.Rename(localPath+".s3tmp", localPath); err != nil { 119 | log.Println(err, localPath) 120 | } 121 | } else if !os.IsNotExist(err) { 122 | log.Println(err, localPath) 123 | } // 如果文件不存在,跳过 124 | } 125 | }() 126 | 127 | fileInfo := FileInfo{ 128 | FromKey: *item.Key, 129 | FromBucket: from.bucket, 130 | Size: *item.Size, 131 | File: file, 132 | } 133 | indexList, chunkSizeAuto := split(fileInfo, cfg.ChunkSize) 134 | partnumberList, _ := getDownloadedParts(fileInfo) 135 | if len(partnumberList) != 0 { 136 | log.Printf("Exist %d/%d parts on local path: %s, %v\n", len(partnumberList), len(indexList), localPath+".s3tmp", partnumberList) 137 | } 138 | var wg2 sync.WaitGroup 139 | for i, offset := range indexList { 140 | if !contains(partnumberList, i+1) { 141 | size := chunkSizeAuto 142 | if offset+chunkSizeAuto > fileInfo.Size { 143 | size = fileInfo.Size - offset 144 | } 145 | partInfo := PartInfo{ 146 | FromKey: fileInfo.FromKey, 147 | FromBucket: fileInfo.FromBucket, 148 | PartNumber: int64(i + 1), 149 | Size: size, 150 | Offset: offset, 151 | TotalParts: int64(len(indexList)), 152 | } 153 | 154 | semPart.Acquire(context.Background(), 1) //从线程池中获取,没有线程可用了就阻塞等待 155 | atomic.AddInt32(&runningGoroutines, 1) //线程计数 156 | wg2.Add(1) 157 | go downloadPart(from.svc, partInfo, fileInfo.File, &wg2, semPart) 158 | } 159 | } 160 | // Clean up download part records, statstic counts 161 | wg2.Wait() 162 | deleteDownloadParts(fileInfo) 163 | multipart_download_finished = true 164 | } 165 | log.Println("***Successfully downloaded:", localPath) 166 | atomic.AddInt64(&objectCount, 1) 167 | atomic.AddInt64(&sizeCount, *item.Size) 168 | }(item) 169 | } 170 | return true 171 | }) 172 | if err != nil { 173 | log.Println("Failed to list objects", err) 174 | return err 175 | } 176 | wg.Wait() 177 | return err 178 | } 179 | 180 | func downloadPartAction(svc *s3.S3, partInfo PartInfo) ([]byte, error) { 181 | log.Printf("-->Downloading part s3://%s %d/%d, runningGoroutines: %d\n", path.Join(partInfo.FromBucket, partInfo.FromKey), partInfo.PartNumber, partInfo.TotalParts, runningGoroutines) 182 | input := &s3.GetObjectInput{ 183 | Bucket: &partInfo.FromBucket, 184 | Key: &partInfo.FromKey, 185 | Range: aws.String(fmt.Sprintf("bytes=%d-%d", partInfo.Offset, partInfo.Offset+partInfo.Size-1)), 186 | } 187 | if from.requestPayer { 188 | input.RequestPayer = aws.String("requester") 189 | } 190 | resp, err := svc.GetObject(input) 191 | if err != nil { 192 | log.Println("Failed to download part", partInfo.FromBucket, partInfo.FromKey, partInfo.PartNumber, err) 193 | return nil, err 194 | } 195 | defer resp.Body.Close() 196 | buffer, err := io.ReadAll(resp.Body) 197 | if err != nil { 198 | log.Println("Failed to read from response body:", partInfo.FromBucket, partInfo.FromKey, partInfo.PartNumber, err) 199 | return nil, err 200 | } 201 | return buffer, nil 202 | } 203 | 204 | func downloadPart(svc *s3.S3, partInfo PartInfo, file *os.File, wg *sync.WaitGroup, semPart *semaphore.Weighted) error { 205 | defer wg.Done() 206 | defer semPart.Release(1) 207 | defer atomic.AddInt32(&runningGoroutines, -1) 208 | 209 | // Download part S3 API Call 210 | buffer, err := downloadPartAction(svc, partInfo) 211 | if err != nil { 212 | return err 213 | } 214 | // Write the part to file 215 | if _, err := file.WriteAt(buffer, partInfo.Offset); err != nil { 216 | log.Println("Failed to write to file", partInfo.FromBucket, partInfo.FromKey, partInfo.PartNumber, err) 217 | return err 218 | } 219 | 220 | // Record the download part 221 | recordDownloadPart(partInfo) 222 | log.Printf("===Downloaded part s3://%s part:%d/%d\n", path.Join(partInfo.FromBucket, partInfo.FromKey), partInfo.PartNumber, partInfo.TotalParts) 223 | return nil 224 | } 225 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module s3trans 2 | 3 | go 1.22.7 4 | 5 | require ( 6 | github.com/aws/aws-sdk-go v1.55.5 7 | github.com/google/uuid v1.6.0 8 | github.com/mattn/go-sqlite3 v1.14.24 9 | github.com/spf13/cobra v1.8.1 10 | github.com/spf13/viper v1.19.0 11 | golang.org/x/sync v0.10.0 12 | ) 13 | 14 | require ( 15 | github.com/fsnotify/fsnotify v1.7.0 // indirect 16 | github.com/hashicorp/hcl v1.0.0 // indirect 17 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 18 | github.com/jmespath/go-jmespath v0.4.0 // indirect 19 | github.com/magiconair/properties v1.8.7 // indirect 20 | github.com/mitchellh/mapstructure v1.5.0 // indirect 21 | github.com/pelletier/go-toml/v2 v2.2.2 // indirect 22 | github.com/sagikazarmark/locafero v0.4.0 // indirect 23 | github.com/sagikazarmark/slog-shim v0.1.0 // indirect 24 | github.com/sourcegraph/conc v0.3.0 // indirect 25 | github.com/spf13/afero v1.11.0 // indirect 26 | github.com/spf13/cast v1.6.0 // indirect 27 | github.com/spf13/pflag v1.0.5 // indirect 28 | github.com/subosito/gotenv v1.6.0 // indirect 29 | go.uber.org/atomic v1.9.0 // indirect 30 | go.uber.org/multierr v1.9.0 // indirect 31 | golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect 32 | golang.org/x/sys v0.18.0 // indirect 33 | golang.org/x/text v0.14.0 // indirect 34 | gopkg.in/ini.v1 v1.67.0 // indirect 35 | gopkg.in/yaml.v3 v3.0.1 // indirect 36 | ) 37 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU= 2 | github.com/aws/aws-sdk-go v1.55.5/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU= 3 | github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= 4 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 5 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 6 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= 7 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 8 | github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= 9 | github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= 10 | github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= 11 | github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= 12 | github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= 13 | github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 14 | github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= 15 | github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= 16 | github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= 17 | github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= 18 | github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= 19 | github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= 20 | github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= 21 | github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= 22 | github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= 23 | github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= 24 | github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= 25 | github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= 26 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= 27 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= 28 | github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= 29 | github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= 30 | github.com/mattn/go-sqlite3 v1.14.24 h1:tpSp2G2KyMnnQu99ngJ47EIkWVmliIizyZBfPrBWDRM= 31 | github.com/mattn/go-sqlite3 v1.14.24/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= 32 | github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= 33 | github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= 34 | github.com/pelletier/go-toml/v2 v2.2.2 h1:aYUidT7k73Pcl9nb2gScu7NSrKCSHIDE89b3+6Wq+LM= 35 | github.com/pelletier/go-toml/v2 v2.2.2/go.mod h1:1t835xjRzz80PqgE6HHgN2JOsmgYu/h4qDAS4n929Rs= 36 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 37 | github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= 38 | github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 39 | github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= 40 | github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= 41 | github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 42 | github.com/sagikazarmark/locafero v0.4.0 h1:HApY1R9zGo4DBgr7dqsTH/JJxLTTsOt7u6keLGt6kNQ= 43 | github.com/sagikazarmark/locafero v0.4.0/go.mod h1:Pe1W6UlPYUk/+wc/6KFhbORCfqzgYEpgQ3O5fPuL3H4= 44 | github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6gto+ugjYE= 45 | github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWRIczQRv+GVI1AkeQ= 46 | github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo= 47 | github.com/sourcegraph/conc v0.3.0/go.mod h1:Sdozi7LEKbFPqYX2/J+iBAM6HpqSLTASQIKqDmF7Mt0= 48 | github.com/spf13/afero v1.11.0 h1:WJQKhtpdm3v2IzqG8VMqrr6Rf3UYpEF239Jy9wNepM8= 49 | github.com/spf13/afero v1.11.0/go.mod h1:GH9Y3pIexgf1MTIWtNGyogA5MwRIDXGUr+hbWNoBjkY= 50 | github.com/spf13/cast v1.6.0 h1:GEiTHELF+vaR5dhz3VqZfFSzZjYbgeKDpBxQVS4GYJ0= 51 | github.com/spf13/cast v1.6.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= 52 | github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= 53 | github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= 54 | github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= 55 | github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= 56 | github.com/spf13/viper v1.19.0 h1:RWq5SEjt8o25SROyN3z2OrDB9l7RPd3lwTWU8EcEdcI= 57 | github.com/spf13/viper v1.19.0/go.mod h1:GQUN9bilAbhU/jgc1bKs99f/suXKeUMct8Adx5+Ntkg= 58 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 59 | github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= 60 | github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= 61 | github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= 62 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 63 | github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 64 | github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= 65 | github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= 66 | github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= 67 | github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 68 | github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= 69 | github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= 70 | go.uber.org/atomic v1.9.0 h1:ECmE8Bn/WFTYwEW/bpKD3M8VtR/zQVbavAoalC1PYyE= 71 | go.uber.org/atomic v1.9.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= 72 | go.uber.org/multierr v1.9.0 h1:7fIwc/ZtS0q++VgcfqFDxSBZVv/Xo49/SYnDFupUwlI= 73 | go.uber.org/multierr v1.9.0/go.mod h1:X2jQV1h+kxSjClGpnseKVIxpmcjrj7MNnI0bnlfKTVQ= 74 | golang.org/x/exp v0.0.0-20230905200255-921286631fa9 h1:GoHiUyI/Tp2nVkLI2mCxVkOjsbSXD66ic0XW0js0R9g= 75 | golang.org/x/exp v0.0.0-20230905200255-921286631fa9/go.mod h1:S2oDrQGGwySpoQPVqRShND87VCbxmc6bL1Yd2oYrm6k= 76 | golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= 77 | golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 78 | golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4= 79 | golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 80 | golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= 81 | golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= 82 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 83 | gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= 84 | gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 85 | gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= 86 | gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= 87 | gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10= 88 | gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 89 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 90 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 91 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 92 | -------------------------------------------------------------------------------- /http_download.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "context" 6 | "fmt" 7 | "io" 8 | "log" 9 | "net/http" 10 | "net/url" 11 | "os" 12 | "path" 13 | "path/filepath" 14 | "strconv" 15 | "strings" 16 | "sync" 17 | "sync/atomic" 18 | "time" 19 | 20 | "golang.org/x/sync/semaphore" 21 | ) 22 | 23 | func startHttpDownload(from, to BInfo) error { 24 | var wg sync.WaitGroup 25 | semFile := semaphore.NewWeighted(int64(cfg.NumWorkers)) // 并发量为NumWorkers的信号量 for file 26 | var httpList []string 27 | 28 | switch cfg.WorkMode { 29 | case "HTTP_DOWNLOAD": 30 | httpList = append(httpList, from.url) 31 | case "HTTP_DOWNLOAD_LIST": 32 | // Read localfile of presign url lines as list from from.url 33 | file, err := os.Open(from.url) 34 | if err != nil { 35 | log.Println("Failed to open file of HTTP_DOWNLOAD_LIST", err) 36 | return err 37 | } 38 | defer file.Close() 39 | 40 | scanner := bufio.NewScanner(file) 41 | for scanner.Scan() { 42 | httpList = append(httpList, scanner.Text()) 43 | } 44 | log.Println("Read local file of HTTP_DOWNLOAD_LIST, total:", len(httpList)) 45 | } 46 | 47 | for _, thisUrl := range httpList { 48 | semFile.Acquire(context.Background(), 1) //从线程信号池中获取,没有线程可用了就阻塞等待 49 | atomic.AddInt32(&runningGoroutines, 1) //线程计数 50 | wg.Add(1) 51 | go downloadHTTPFile(thisUrl, &wg, semFile) 52 | } 53 | wg.Wait() 54 | return nil 55 | } 56 | 57 | func downloadHTTPFile(thisUrl string, wg *sync.WaitGroup, semFile *semaphore.Weighted) error { 58 | defer wg.Done() 59 | defer semFile.Release(1) 60 | defer atomic.AddInt32(&runningGoroutines, -1) 61 | 62 | // Download each object 63 | URL, err := url.Parse(thisUrl) 64 | if err != nil { 65 | log.Printf("Invalid HTTP URL: %s, %v\n", thisUrl, err) 66 | return err 67 | } 68 | from.bucket = strings.Split(URL.Host, ".")[0] 69 | fullPrefix := strings.TrimSuffix(strings.TrimPrefix(URL.Path, "/"), "/") 70 | fileName := filepath.Base(fullPrefix) 71 | localPath := filepath.Join(to.url, fileName) 72 | 73 | // Get the file size 74 | fileSize, err := getHTTPFileSize(thisUrl) 75 | if err != nil { 76 | log.Println("Failed to get file size:", thisUrl, err) 77 | return err 78 | } 79 | 80 | // Check if file already exists and is the same size 81 | info, err := os.Stat(localPath) 82 | if !cfg.SkipCompare { 83 | if err == nil && info.Size() == fileSize { 84 | log.Println("...File exists and same size, skipping", localPath) 85 | return nil 86 | } else if err != nil && !os.IsNotExist(err) { 87 | log.Println("Failed to stat file", localPath, err) 88 | return err 89 | } 90 | } 91 | 92 | log.Println(" Start to https download:", localPath) 93 | multipart_download_finished := false 94 | 95 | // Create necessary directories 96 | thisdir := filepath.Dir(localPath) 97 | if err := os.MkdirAll(thisdir, 0755); err != nil { 98 | log.Println("Failed to create directories:", localPath, err) 99 | return err 100 | } 101 | 102 | file, err := os.OpenFile(localPath+".s3tmp", os.O_CREATE|os.O_WRONLY, 0644) 103 | if err != nil { 104 | log.Println("Failed to create s3tmp file:", localPath, err) 105 | return err 106 | } 107 | defer func() { 108 | file.Close() // 确保在 file close之后再执行rename 109 | if multipart_download_finished { 110 | // 检查文件是否存在, 如果文件存在,重命名为 localPath 111 | if _, err := os.Stat(localPath + ".s3tmp"); err == nil { 112 | // 113 | if err := os.Rename(localPath+".s3tmp", localPath); err != nil { 114 | log.Println(err, localPath) 115 | } 116 | } else if !os.IsNotExist(err) { 117 | log.Println(err, localPath) 118 | } // 如果文件不存在,跳过 119 | } 120 | }() 121 | 122 | // list parts numbers 123 | fileInfo := FileInfo{ 124 | FromKey: fullPrefix, 125 | FromBucket: from.bucket, 126 | Size: fileSize, 127 | File: file, 128 | } 129 | indexList, chunkSizeAuto := split(fileInfo, cfg.ChunkSize) 130 | partnumberList, _ := getDownloadedParts(fileInfo) 131 | if len(partnumberList) != 0 { 132 | log.Printf("Exist %d/%d parts on local path: %s, %v\n", len(partnumberList), len(indexList), localPath+".s3tmp", partnumberList) 133 | } 134 | 135 | // Follow indexList to download parts 136 | var wg2 sync.WaitGroup 137 | semPart := semaphore.NewWeighted(int64(cfg.NumWorkers * 4)) // 并发量为NumWorkers的信号量 for parts 138 | 139 | for i, offset := range indexList { 140 | if !contains(partnumberList, i+1) { 141 | size := chunkSizeAuto 142 | if offset+chunkSizeAuto > fileInfo.Size { 143 | size = fileInfo.Size - offset 144 | } 145 | partInfo := PartInfo{ 146 | FromBucket: from.bucket, 147 | FromKey: fullPrefix, 148 | URL: thisUrl, 149 | PartNumber: int64(i + 1), 150 | Size: size, 151 | Offset: offset, 152 | TotalParts: int64(len(indexList)), 153 | } 154 | 155 | semPart.Acquire(context.Background(), 1) //从线程池中获取,没有线程可用了就阻塞等待 156 | atomic.AddInt32(&runningGoroutines, 1) //线程计数 157 | wg2.Add(1) 158 | go downloadHttpChunk(partInfo, fileInfo.File, &wg2, semPart) 159 | } 160 | } 161 | // Clean up download part records, statstic counts 162 | wg2.Wait() 163 | deleteDownloadParts(fileInfo) 164 | multipart_download_finished = true 165 | log.Println(" Finish https download:", localPath) 166 | atomic.AddInt64(&objectCount, 1) 167 | atomic.AddInt64(&sizeCount, fileSize) 168 | return nil 169 | } 170 | 171 | func downloadHttpChunk(partInfo PartInfo, file *os.File, wg *sync.WaitGroup, semPart *semaphore.Weighted) error { 172 | defer wg.Done() 173 | defer semPart.Release(1) 174 | defer atomic.AddInt32(&runningGoroutines, -1) 175 | 176 | // Download part HTTP API Call 177 | buffer, err := getHTTPFileBody(partInfo) 178 | if err != nil { 179 | return err 180 | } 181 | // Write the part to file 182 | if _, err := file.WriteAt(buffer, partInfo.Offset); err != nil { 183 | log.Printf("Failed to write part s3://%s part:%d/%d, err: %v\n", path.Join(partInfo.FromBucket, partInfo.FromKey), partInfo.PartNumber, partInfo.TotalParts, err) 184 | return err 185 | } 186 | 187 | // Record the download part 188 | recordDownloadPart(partInfo) 189 | log.Printf("===Downloaded part s3://%s part:%d/%d\n", path.Join(partInfo.FromBucket, partInfo.FromKey), partInfo.PartNumber, partInfo.TotalParts) 190 | return nil 191 | 192 | } 193 | 194 | func getHTTPFileBody(partInfo PartInfo) ([]byte, error) { 195 | log.Printf("-->Downloading part s3://%s %d/%d, runningGoroutines: %d\n", path.Join(partInfo.FromBucket, partInfo.FromKey), partInfo.PartNumber, partInfo.TotalParts, runningGoroutines) 196 | 197 | req, err := http.NewRequest("GET", partInfo.URL, nil) 198 | if err != nil { 199 | fmt.Println("Error creating request:", err) 200 | return nil, err 201 | } 202 | 203 | req.Header.Set("Range", fmt.Sprintf("bytes=%d-%d", partInfo.Offset, partInfo.Offset+partInfo.Size-1)) 204 | 205 | retryRoundTripper := &RetryRoundTripper{ 206 | Proxied: http.DefaultTransport, 207 | Retries: 3, // Set the desired number of retries 208 | Delay: time.Second * 5, // Set the desired delay between retries 209 | } 210 | client := &http.Client{ 211 | Transport: retryRoundTripper, 212 | } 213 | 214 | resp, err := client.Do(req) 215 | if err != nil { 216 | fmt.Println("Error downloading chunk:", err) 217 | return nil, err 218 | } 219 | defer resp.Body.Close() 220 | 221 | buffer := make([]byte, partInfo.Size) 222 | _, err = io.ReadFull(resp.Body, buffer) 223 | if err != nil { 224 | fmt.Println("Error reading chunk:", err) 225 | return nil, err 226 | } 227 | 228 | return buffer, nil 229 | } 230 | 231 | func getHTTPFileSize(thisUrl string) (int64, error) { 232 | req, err := http.NewRequest("GET", thisUrl, nil) 233 | if err != nil { 234 | log.Println("Failed to create request for:", thisUrl, err) 235 | return 0, err 236 | } 237 | req.Header.Set("Range", "bytes=0-0") 238 | 239 | retryRoundTripper := &RetryRoundTripper{ 240 | Proxied: http.DefaultTransport, 241 | Retries: 3, // Set the desired number of retries 242 | Delay: time.Second * 5, // Set the desired delay between retries 243 | } 244 | client := &http.Client{ 245 | Transport: retryRoundTripper, 246 | } 247 | 248 | resp, err := client.Do(req) 249 | if err != nil { 250 | log.Println("Failed to GET file size for:", thisUrl, err) 251 | return 0, err 252 | } 253 | defer resp.Body.Close() 254 | if resp.StatusCode != http.StatusPartialContent { 255 | log.Println("unexpected status code while GET file size:", resp.StatusCode, thisUrl) 256 | return 0, err 257 | } 258 | fileSizeStr := resp.Header.Get("Content-Range") 259 | if fileSizeStr == "" { 260 | log.Println("missing Content-Range header while GET file size for:", thisUrl) 261 | return 0, err 262 | } 263 | parts := strings.Split(fileSizeStr, "/") 264 | if len(parts) != 2 { 265 | log.Println("invalid Content-Range header format while GET file size for:", thisUrl) 266 | return 0, err 267 | } 268 | 269 | fileSize, err := strconv.ParseInt(parts[1], 10, 64) 270 | if err != nil { 271 | log.Println("Failed to parse file size for:", thisUrl, err) 272 | return 0, err 273 | } 274 | return fileSize, nil 275 | } 276 | -------------------------------------------------------------------------------- /img/arch-cn1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-s3-resumable-upload/ccc06cc59b4bac7aa092450099c5bbc29f022914/img/arch-cn1.png -------------------------------------------------------------------------------- /img/arch-cn2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-s3-resumable-upload/ccc06cc59b4bac7aa092450099c5bbc29f022914/img/arch-cn2.png -------------------------------------------------------------------------------- /img/arch-en1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-s3-resumable-upload/ccc06cc59b4bac7aa092450099c5bbc29f022914/img/arch-en1.png -------------------------------------------------------------------------------- /img/arch-en2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-s3-resumable-upload/ccc06cc59b4bac7aa092450099c5bbc29f022914/img/arch-en2.png -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | // 多线程并发断点续传上传/下载S3,支持支持Amazon S3, Ali OSS, Tencent COS, Google GCS 等兼容S3 API的对象存储 2 | // 使用 ./s3trans -h 获取更多帮助信息 3 | package main 4 | 5 | import ( 6 | "fmt" 7 | "log" 8 | "net/http" 9 | "net/url" 10 | "os" 11 | "strings" 12 | "time" 13 | 14 | "github.com/aws/aws-sdk-go/aws" 15 | "github.com/aws/aws-sdk-go/aws/credentials" 16 | "github.com/aws/aws-sdk-go/aws/session" 17 | "github.com/aws/aws-sdk-go/service/s3" 18 | "github.com/aws/aws-sdk-go/service/s3/s3manager" 19 | "github.com/aws/aws-sdk-go/service/sqs" 20 | _ "github.com/mattn/go-sqlite3" // 导入SQLite3包但不使用,只用其驱动 21 | "github.com/spf13/cobra" 22 | "github.com/spf13/viper" 23 | ) 24 | 25 | type Config struct { 26 | ListTarget bool `mapstructure:"list-target"` // 一次性从目标S3获取列表进行对比再开始传输,文件数量大的情况可以节省每次请求之前逐个文件对比的API Call 27 | SkipCompare bool `mapstructure:"skip-compare"` // 是否不做目标S3与源文件的对比,即无论是否有重复文件,都直接开始传输并覆盖 28 | TransferMetadata bool `mapstructure:"transfer-metadata"` // 是否传输源S3 Object MetaData到目标S3,只在S3toS3模式下可用 29 | HttpTimeout int `mapstructure:"http-timeout"` // S3 http 超时时间(秒) 30 | MaxRetries int `mapstructure:"max-retries"` // API 请求最大重试次数 31 | ResumableThreshold int64 `mapstructure:"resumable-threshold"` // 走断点续传流程的门槛,小于该值则直接并发下载,对于文件不大或不担心中断的情况效率更高(单位MB) 32 | NumWorkers int `mapstructure:"num-workers"` // 控制 goroutine 总量 33 | WorkMode string `mapstructure:"work-mode"` // SQS_SEND | SQS_CONSUME 34 | SQSUrl string `mapstructure:"sqs-url"` // SQS Queue URL 35 | SQSProfile string `mapstructure:"sqs-profile"` // SQS Queue Profile 36 | YPtr bool `mapstructure:"y"` // Ignore waiting for confirming command 37 | DBPath string `mapstructure:"db-location"` // 自动创建已经下载的分片状态记录数据库 38 | ChunkSize int64 `mapstructure:"chunk-size"` // Multipart 分片大小 39 | RetryDelay int `mapstructure:"retry-delay"` // API 请求重试延迟时间(秒) 40 | JobListPath string `mapstructure:"joblist-write-to-filepath"` // 列出S3传输任务之后,写入到一个文件作为备份 41 | SQSSentLogName string `mapstructure:"sqs-log-to-filename"` // SQS已发送消息的记录文件名 42 | IgnoreListPath string `mapstructure:"ignore-list-path"` // List和传输的时候,如果S3源的Key或本地源路径的前缀在Ignore List里面,则跳过。设置的时候注意S3的Key是不带“/”开头的 43 | ForcePathStyle bool `mapstructure:"force-path-style"` // 强制使用路径方式访问S3,而不是域名方式 44 | } 45 | 46 | type BInfo struct { 47 | url, bucket, prefix, profile, endpoint, region, storageClass, ACL string 48 | noSignRequest bool // The bucket is noSignRequest, no need to sign 49 | requestPayer bool // The bucket is requestPayer 50 | sess *session.Session 51 | svc *s3.S3 52 | downloader *s3manager.Downloader 53 | uploader *s3manager.Uploader 54 | } 55 | 56 | type MetaStruct struct { 57 | Metadata map[string]*string 58 | ContentType, ContentLanguage, ContentEncoding, CacheControl, ContentDisposition *string 59 | Expires *time.Time 60 | } 61 | 62 | type FileInfo struct { 63 | FromKey, FromBucket, ToKey, ToBucket string 64 | Size int64 65 | File *os.File 66 | Others MetaStruct 67 | } 68 | 69 | type PartInfo struct { 70 | FromKey, FromBucket, ToKey, ToBucket, Etag string 71 | Size, Offset int64 72 | PartNumber, TotalParts int64 73 | URL string 74 | } 75 | 76 | type RetryFunc func() error 77 | 78 | var ( 79 | from, to BInfo 80 | objectCount, sizeCount int64 81 | runningGoroutines int32 // 当前正在运行的 goroutine 的数量 82 | cfg Config 83 | sqsSvc *sqs.SQS 84 | ) 85 | 86 | var rootCmd = &cobra.Command{ 87 | Use: "s3trans FROM_URL TO_URL", 88 | Short: "s3trans transfers data from source to target", 89 | Long: `s3trans transfers data from source to target. 90 | ./s3trans FROM_URL TO_URL [OPTIONS] 91 | FROM_URL: The url of data source, e.g. /home/user/data or s3://bucket/prefix 92 | TO_URL: The url of data transfer target, e.g. /home/user/data or s3://bucket/prefix 93 | For example: 94 | ./s3trans s3://bucket/prefix s3://bucket/prefix -from_profile sin -to_profile bjs 95 | ./s3trans s3://bucket/prefix /home/user/data -from_profile sin 96 | `, 97 | Args: cobra.ExactArgs(2), // 要求必须提供2个参数 98 | Run: func(cmd *cobra.Command, args []string) { 99 | // args[0] 是 FROM_URL, args[1] 是 TO_URL 100 | from.url = args[0] 101 | to.url = args[1] 102 | }, 103 | } 104 | 105 | func init() { 106 | rootCmd.SetHelpFunc(func(cmd *cobra.Command, args []string) { 107 | fmt.Print(cmd.Long) 108 | os.Exit(0) 109 | }) 110 | cobra.OnInitialize(initConfig) 111 | rootCmd.PersistentFlags().String("from-profile", "", "The AWS profile in ~/.aws/credentials of data source") 112 | viper.BindPFlag("from-profile", rootCmd.PersistentFlags().Lookup("from-profile")) 113 | rootCmd.PersistentFlags().Bool("force-path-style", false, "Set this to `true` to force the request to use path-style addressing See http://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html") 114 | viper.BindPFlag("force-path-style", rootCmd.PersistentFlags().Lookup("force-path-style")) 115 | rootCmd.PersistentFlags().String("to-profile", "", "The AWS profile in ~/.aws/credentials of data transfer target") 116 | viper.BindPFlag("to-profile", rootCmd.PersistentFlags().Lookup("to-profile")) 117 | rootCmd.PersistentFlags().String("from-endpoint", "", "The endpoint of data source, e.g. https://storage.googleapis.com; https://oss-.aliyuncs.com; https://cos..myqcloud.com . If AWS s3 or local path, no need to specify this.") 118 | viper.BindPFlag("from-endpoint", rootCmd.PersistentFlags().Lookup("from-endpoint")) 119 | rootCmd.PersistentFlags().String("to-endpoint", "", "The endpoint of data transfer target, e.g. https://storage.googleapis.com . If AWS s3 or local path, no need to specify this.") 120 | viper.BindPFlag("to-endpoint", rootCmd.PersistentFlags().Lookup("to-endpoint")) 121 | rootCmd.PersistentFlags().String("from-region", "", "The region of data transfer source, e.g. cn-north-1. If no specified, the region will be auto detected with the credentials you provided in profile.") 122 | viper.BindPFlag("from-region", rootCmd.PersistentFlags().Lookup("from-region")) 123 | rootCmd.PersistentFlags().String("to-region", "", "The region of data transfer target, e.g. us-east-1. If no specified, the region will be auto detected with the credentials you provided in profile.") 124 | viper.BindPFlag("to-region", rootCmd.PersistentFlags().Lookup("to-region")) 125 | rootCmd.PersistentFlags().String("storage-class", "", "The TARGET S3 bucket storage class, e.g. STANDARD|REDUCED_REDUNDANCY|STANDARD_IA|ONEZONE_IA|INTELLIGENT_TIERING|GLACIER|DEEP_ARCHIVE|OUTPOSTS|GLACIER_IR|SNOW or others of S3 compatibale") 126 | viper.BindPFlag("storage-class", rootCmd.PersistentFlags().Lookup("storage-class")) 127 | rootCmd.PersistentFlags().String("acl", "", "The TARGET S3 bucket ACL, private means only the object owner can read&write, e.g. private|public-read|public-read-write|authenticated-read|aws-exec-read|bucket-owner-read|bucket-owner-full-control") 128 | viper.BindPFlag("acl", rootCmd.PersistentFlags().Lookup("acl")) 129 | rootCmd.PersistentFlags().Bool("no-sign-request", false, "The SOURCE bucket is not needed to sign the request") 130 | viper.BindPFlag("no-sign-request", rootCmd.PersistentFlags().Lookup("no-sign-request")) 131 | rootCmd.PersistentFlags().Bool("request-payer", false, "The SOURCE bucket requires requester to pay, set this") 132 | viper.BindPFlag("request-payer", rootCmd.PersistentFlags().Lookup("request-payer")) 133 | rootCmd.PersistentFlags().String("db-location", "./download-status.db", "local db to record download resumable status") 134 | viper.BindPFlag("db-location", rootCmd.PersistentFlags().Lookup("db-location")) 135 | 136 | rootCmd.PersistentFlags().BoolP("list-target", "l", false, "List the TARGET S3 bucket, compare exist objects BEFORE transfer. List is more efficient than head each object to check if it exists, but transfer may start slower because it needs to wait for listing all objects to compare. To mitigate this, this app leverage Concurrency Listing for fast list; If no list-target para, transfer without listing the target S3 bucket, but before transfering each object, head each target object to check, this costs more API call, but start faster.") 137 | viper.BindPFlag("list-target", rootCmd.PersistentFlags().Lookup("list-target")) 138 | rootCmd.PersistentFlags().BoolP("skip-compare", "s", false, "If True, skip to compare the name and size between source and target S3 object. Just overwrite all objects. No list target nor head target object to check if it already exists.") 139 | viper.BindPFlag("skip-compare", rootCmd.PersistentFlags().Lookup("skip-compare")) 140 | rootCmd.PersistentFlags().Bool("transfer-metadata", false, "If True, get metadata from source S3 bucket and upload the metadata to target object. This costs more API calls.") 141 | viper.BindPFlag("transfer-metadata", rootCmd.PersistentFlags().Lookup("transfer-metadata")) 142 | 143 | rootCmd.PersistentFlags().Int("http-timeout", 30, "API request timeout (seconds)") 144 | viper.BindPFlag("http-timeout", rootCmd.PersistentFlags().Lookup("http-timeout")) 145 | rootCmd.PersistentFlags().Int("max-retries", 5, "API request max retries") 146 | viper.BindPFlag("max-retries", rootCmd.PersistentFlags().Lookup("max-retries")) 147 | rootCmd.PersistentFlags().Int("retry-delay", 5, "Delay before next retry in secondes") 148 | viper.BindPFlag("retry-delay", rootCmd.PersistentFlags().Lookup("retry-delay")) 149 | rootCmd.PersistentFlags().Int64("chunk-size", 5, "Multipart part size(MB)") 150 | viper.BindPFlag("chunk-size", rootCmd.PersistentFlags().Lookup("chunk-size")) 151 | rootCmd.PersistentFlags().Int64("resumable-threshold", 50, "When the file size (MB) is larger than this value, the file will be resumable transfered.") 152 | viper.BindPFlag("resumable-threshold", rootCmd.PersistentFlags().Lookup("resumable-threshold")) 153 | rootCmd.PersistentFlags().IntP("num-workers", "n", 4, "Max concurrent threads = NumWorkers*NumWorkers*4 (files*parts*4), recommend NumWorkers <= vCPU number") 154 | viper.BindPFlag("num-workers", rootCmd.PersistentFlags().Lookup("num-workers")) 155 | rootCmd.PersistentFlags().BoolP("y", "y", false, "Ignore waiting for confirming command") 156 | viper.BindPFlag("y", rootCmd.PersistentFlags().Lookup("y")) 157 | 158 | rootCmd.PersistentFlags().String("work-mode", "", "SQS_SEND | SQS_CONSUME | DRYRUN | HTTP_DOWNLOAD_LIST; SQS_SEND means listing source FROM_URL S3 and target TO_URL S3 to compare and send message to SQS queue, SQS_CONSUME means consume message from SQS queue and transfer objects from FROM_URL S3 to TO_URL S3; DRYRUN means only count the objects and sizes comparing delta list of FROM_URL S3 and TO_URL S3, no transfer; HTTP_DOWNLOAD_LIST, from a list file with lines of presign url;") 159 | viper.BindPFlag("work-mode", rootCmd.PersistentFlags().Lookup("work-mode")) 160 | rootCmd.PersistentFlags().String("sqs-url", "", "The SQS queue URL to send or consume message from, e.g. https://sqs.us-east-1.amazonaws.com/my_account/my_queue_name") 161 | viper.BindPFlag("sqs-url", rootCmd.PersistentFlags().Lookup("sqs-url")) 162 | rootCmd.PersistentFlags().String("sqs-profile", "", "The SQS queue leverage which AWS profile in ~/.aws/credentials") 163 | viper.BindPFlag("sqs-profile", rootCmd.PersistentFlags().Lookup("sqs-profile")) 164 | rootCmd.PersistentFlags().String("joblist-write-to-filepath", "", "After listing source and target S3, compare the delta joblist and write the joblist to this filepath, e.g. ./joblist.txt") 165 | viper.BindPFlag("joblist-write-to-filepath", rootCmd.PersistentFlags().Lookup("joblist-write-to-filepath")) 166 | rootCmd.PersistentFlags().String("sqs-log-to-filename", "", "After sent joblist to SQS, write the sent messages log to this filepath, e.g. ./sqs-log.txt") 167 | viper.BindPFlag("sqs-log-to-filename", rootCmd.PersistentFlags().Lookup("sqs-log-to-filename")) 168 | rootCmd.PersistentFlags().String("ignore-list-path", "", "When listing and transfer, if source S3 key or local path matching the prefix in this ignore-list, it will be ignored. This is useful to ignore some objects that are not needed to transfer. The ignore-list is a file path, e.g. ./ignore-list.txt") 169 | viper.BindPFlag("ignore-list-path", rootCmd.PersistentFlags().Lookup("ignore-list-path")) 170 | } 171 | 172 | func initConfig() { 173 | viper.AutomaticEnv() // read in environment variables that match 174 | 175 | // If a config file is found, read it in. 176 | viper.SetConfigFile("config.yaml") // YAML 格式配置文件 config.yaml 177 | if err := viper.ReadInConfig(); err == nil { 178 | fmt.Println("Using config file:", viper.ConfigFileUsed()) 179 | } 180 | // Unmarshal config into cfg struct 181 | if err := viper.Unmarshal(&cfg); err != nil { 182 | fmt.Println("Error unmarshalling config:", err) 183 | os.Exit(1) 184 | } 185 | } 186 | 187 | func getConfig() { 188 | if err := rootCmd.Execute(); err != nil { 189 | fmt.Println(err) 190 | os.Exit(1) 191 | } 192 | from.profile = viper.GetString("from-profile") 193 | to.profile = viper.GetString("to-profile") 194 | from.endpoint = viper.GetString("from-endpoint") 195 | to.endpoint = viper.GetString("to-endpoint") 196 | from.region = viper.GetString("from-region") 197 | to.region = viper.GetString("to-region") 198 | to.storageClass = viper.GetString("storage-class") 199 | to.ACL = viper.GetString("acl") 200 | from.noSignRequest = viper.GetBool("no-sign-request") 201 | from.requestPayer = viper.GetBool("request-payer") 202 | cfg.ResumableThreshold = cfg.ResumableThreshold * 1024 * 1024 203 | cfg.ChunkSize = cfg.ChunkSize * 1024 * 1024 204 | 205 | for i, binfo := range []*BInfo{&from, &to} { 206 | if i == 0 { 207 | fmt.Print("From ") 208 | } else { 209 | fmt.Print("To ") 210 | } 211 | if strings.HasPrefix(binfo.url, "s3://") { 212 | // Parse S3 URL 213 | URL, err := url.Parse(binfo.url) 214 | if err != nil { 215 | log.Fatalf("Invalid S3 URL: %s, %v\n", binfo.url, err) 216 | os.Exit(1) 217 | } 218 | binfo.bucket = URL.Host 219 | binfo.prefix = strings.TrimSuffix(strings.TrimPrefix(URL.Path, "/"), "/") 220 | binfo.sess = getSess(binfo) 221 | binfo.svc = s3.New(binfo.sess) 222 | if i == 0 { 223 | binfo.downloader = s3manager.NewDownloader(binfo.sess) 224 | binfo.downloader.Concurrency = cfg.NumWorkers * 4 225 | binfo.downloader.PartSize = cfg.ChunkSize 226 | } else { 227 | binfo.uploader = s3manager.NewUploader(binfo.sess) 228 | binfo.uploader.Concurrency = cfg.NumWorkers * 4 229 | binfo.uploader.PartSize = cfg.ChunkSize 230 | } 231 | fmt.Printf("Bucket: %s, Prefix: %s, Profile: %s, Endpoint-URL: %s, Region:%s\n", binfo.bucket, binfo.prefix, binfo.profile, binfo.endpoint, binfo.region) 232 | } else 233 | 234 | // TODO: Azure Blog Storage 235 | 236 | { // Support presign url 237 | if strings.HasPrefix(binfo.url, "http") { 238 | fmt.Printf("Presign URL: %s\n", binfo.url) 239 | continue 240 | } 241 | 242 | // Verify the local path 243 | urlInfo, err := os.Stat(binfo.url) 244 | if err != nil { 245 | log.Printf("Invalid path, try to create directories: %s\n", binfo.url) // 自动创建新目录 246 | if err := os.MkdirAll(binfo.url, 0755); err != nil { 247 | log.Fatalln("Failed to create directories:", binfo.url, err) 248 | } 249 | } else { 250 | if urlInfo.IsDir() && !strings.HasSuffix(binfo.url, string(os.PathSeparator)) { 251 | binfo.url += string(os.PathSeparator) 252 | } 253 | fmt.Printf("Local: %s\n", binfo.url) 254 | } 255 | } 256 | } 257 | if cfg.WorkMode == "SQS_SEND" || cfg.WorkMode == "SQS_CONSUME" { 258 | sqsSvc = getSQSsess() 259 | } 260 | } 261 | 262 | func main() { 263 | startTime := time.Now() 264 | getConfig() 265 | fmt.Printf(" Target StorageClass(default: STANDARD): %s\n Target ACL(default: private): %s\n Source noSignRequest: %t\n Source requestPayer: %t\n", to.storageClass, to.ACL, from.noSignRequest, from.requestPayer) 266 | fmt.Printf(" Transfer Metadata: %t\n List Target Before Transfer(Recommended): %t\n Skip Compare Before Transfer: %t\n", cfg.TransferMetadata, cfg.ListTarget, cfg.SkipCompare) 267 | fmt.Printf(" NumWorkers: %d for concurrency files; NumWorkers*4 for parts of each file and for listing target bucket\n", cfg.NumWorkers) 268 | fmt.Printf(" HttpTimeout: %ds\n MaxRetries: %d\n ResumableThreshold: %s\n", cfg.HttpTimeout, cfg.MaxRetries, ByteCountSI(cfg.ResumableThreshold)) 269 | fmt.Printf(" ChunkSize: %s\n", ByteCountSI(cfg.ChunkSize)) 270 | fmt.Printf(" WorkMode: %s\n SQS_PROFILE: %s\n SQS_URL: %s\n", cfg.WorkMode, cfg.SQSProfile, cfg.SQSUrl) 271 | // fmt.Printf("Start to transfer data? (y/n): \n") 272 | // if !cfg.YPtr { 273 | // var answer string 274 | // fmt.Scanln(&answer) 275 | // if answer != "y" { 276 | // log.Fatalln("Exit app with n command.") 277 | // } 278 | // } 279 | switch { 280 | case strings.ToUpper(cfg.WorkMode) == "DRYRUN": 281 | err := compareBucket(from, to, nil) 282 | if err != nil { 283 | log.Println("Failed to count:", err) 284 | return 285 | } 286 | case strings.ToUpper(cfg.WorkMode) == "SQS_SEND": 287 | err := compareBucket(from, to, sqsSvc) 288 | if err != nil { 289 | log.Println("Failed to send sqs:", err) 290 | return 291 | } 292 | case strings.ToUpper(cfg.WorkMode) == "SQS_CONSUME": 293 | err := consumeSQS(sqsSvc) 294 | if err != nil { 295 | log.Println("Failed to consume sqs:", err) 296 | return 297 | } 298 | case strings.HasPrefix(from.url, "s3://") && strings.HasPrefix(to.url, "s3://"): 299 | cfg.WorkMode = "S3TOS3" 300 | err := s3tos3(from, to) 301 | if err != nil { 302 | log.Println("Failed to s3tos3:", err) 303 | return 304 | } 305 | case strings.HasPrefix(from.url, "s3://"): 306 | cfg.WorkMode = "GET" 307 | err := startDownload(from, to) 308 | if err != nil { 309 | log.Println("Failed to download:", err) 310 | return 311 | } 312 | case strings.HasPrefix(to.url, "s3://"): 313 | cfg.WorkMode = "PUT" 314 | err := startUpload(from, to) 315 | if err != nil { 316 | log.Println("Failed to upload:", err) 317 | return 318 | } 319 | case strings.HasPrefix(from.url, "http"): 320 | cfg.WorkMode = "HTTP_DOWNLOAD" 321 | err := startHttpDownload(from, to) 322 | if err != nil { 323 | log.Println("Failed to download:", err) 324 | return 325 | } 326 | case strings.ToUpper(cfg.WorkMode) == "HTTP_DOWNLOAD_LIST": 327 | err := startHttpDownload(from, to) 328 | if err != nil { 329 | log.Println("Failed to download from list:", err) 330 | return 331 | } 332 | default: 333 | log.Fatal("ERR WorkMode, invalid FROM_URL or TO_URL") 334 | } 335 | log.Printf("\n\nTotalObjects:%d, TotalSizes:%s(%d). The program ran for %v\n", objectCount, ByteCountSI(sizeCount), sizeCount, time.Since(startTime)) 336 | log.Println("From:", from.url) 337 | log.Println("To:", to.url) 338 | } 339 | 340 | type RetryRoundTripper struct { 341 | Proxied http.RoundTripper 342 | Retries int 343 | Delay time.Duration 344 | } 345 | 346 | func (rrt *RetryRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) { 347 | var resp *http.Response 348 | var err error 349 | 350 | for i := 0; i <= rrt.Retries; i++ { 351 | resp, err = rrt.Proxied.RoundTrip(req) 352 | if err != nil { 353 | log.Printf("HTTP API Request failed and retry: %s", err) 354 | time.Sleep(rrt.Delay) 355 | continue 356 | } 357 | break 358 | } 359 | return resp, err 360 | } 361 | func getSess(bInfo *BInfo) *session.Session { 362 | // 创建具有超时重试的 http 客户端 363 | client := &http.Client{ 364 | Timeout: time.Duration(cfg.HttpTimeout) * time.Second, 365 | Transport: &RetryRoundTripper{ 366 | Proxied: http.DefaultTransport, 367 | Retries: cfg.MaxRetries, 368 | Delay: time.Duration(cfg.RetryDelay) * time.Second, 369 | }, 370 | } 371 | config := aws.Config{ 372 | MaxRetries: aws.Int(cfg.MaxRetries), // 自定义S3 Client最大重试次数 373 | HTTPClient: client, // 使用自定义了超时时间的 http 客户端 374 | } 375 | if cfg.ForcePathStyle { 376 | config.S3ForcePathStyle = aws.Bool(true) // 以路径方式访问 而不是域名 377 | } 378 | if bInfo.endpoint != "" { 379 | completeEndpointURL(bInfo) // 自动完善endpoint url 380 | config.Endpoint = aws.String(bInfo.endpoint) 381 | } 382 | // 如果noSignRequest 则必须要有region 383 | if bInfo.noSignRequest { 384 | if bInfo.region != "" { 385 | config.Credentials = credentials.AnonymousCredentials 386 | } else { 387 | log.Fatalf("No region specified for noSignRequest bucket: %s\n", bInfo.bucket) 388 | } 389 | } else if bInfo.region == "" { 390 | // Call GetBucketLocation to determine the bucket's region. 391 | tempS3sess, err := session.NewSessionWithOptions(session.Options{ 392 | Config: config, 393 | Profile: bInfo.profile, // ~/.aws/目录下,文件名为config或者credentials 394 | SharedConfigState: session.SharedConfigEnable, 395 | }) 396 | if err != nil { 397 | log.Fatalf("Failed to create session with reading ~/.aws/credentials profile: %s, with endpoint: %s err: %v\n", bInfo.profile, bInfo.endpoint, err) 398 | } 399 | result, err := s3.New(tempS3sess).GetBucketLocation(&s3.GetBucketLocationInput{ 400 | Bucket: aws.String(bInfo.bucket), 401 | }) 402 | if err != nil { 403 | log.Fatalf("Failed to get bucket location: %s, err: %v\n", bInfo.bucket, err) 404 | } 405 | if result.LocationConstraint == nil { 406 | bInfo.region = "us-east-1" // Default bucket's region is us-east-1 407 | } else { 408 | bInfo.region = aws.StringValue(result.LocationConstraint) 409 | } 410 | } 411 | config.Region = aws.String(bInfo.region) 412 | sess, err := session.NewSessionWithOptions(session.Options{ 413 | Config: config, 414 | Profile: bInfo.profile, 415 | SharedConfigState: session.SharedConfigEnable, 416 | }) 417 | if err != nil { 418 | log.Fatalf("Failed to create session with reading ~/.aws/credentials profile: %s, in bucket region: %s, with endpoint: %s err: %v\n", bInfo.profile, bInfo.region, bInfo.endpoint, err) 419 | } 420 | return sess 421 | } 422 | 423 | // 自动完善endpoint url 424 | func completeEndpointURL(bInfo *BInfo) { 425 | switch bInfo.endpoint { 426 | case "Aliyun_OSS": 427 | if bInfo.region == "" { 428 | log.Fatalf("No region specified for bucket: %s\n", bInfo.bucket) 429 | } 430 | bInfo.endpoint = fmt.Sprintf("https://oss-%s.aliyuncs.com", bInfo.region) 431 | case "Tencent_COS": 432 | if bInfo.region == "" { 433 | log.Fatalf("No region specified for bucket:%s\n", bInfo.bucket) 434 | } 435 | bInfo.endpoint = fmt.Sprintf("https://cos.%s.myqcloud.com", bInfo.region) 436 | case "Google_GCS": 437 | bInfo.endpoint = "https://storage.googleapis.com" 438 | } 439 | // 都不是以上定义字符串则自直接使用endpoint url的字符串 440 | 441 | } 442 | 443 | func getSQSsess() *sqs.SQS { 444 | // get region from cfg.SQSUrl "https://sqs.us-east-1.amazonaws.com/my_account/my_queue_name" 445 | u, err := url.Parse(cfg.SQSUrl) 446 | if err != nil { 447 | log.Fatalln("fail to parse SQS url", err) 448 | } 449 | hostParts := strings.Split(u.Host, ".") 450 | if len(hostParts) < 2 { 451 | log.Fatalln("Invalid SQS URL") 452 | } 453 | SQSRegion := hostParts[1] 454 | 455 | // 创建具有超时的 http 客户端 456 | client := &http.Client{Timeout: time.Duration(cfg.HttpTimeout) * time.Second} 457 | config := aws.Config{ 458 | MaxRetries: aws.Int(cfg.MaxRetries), // 自定义S3 Client最大重试次数 459 | HTTPClient: client, // 使用自定义了超时时间的 http 客户端 460 | Region: aws.String(SQSRegion), 461 | } 462 | sqssess, err := session.NewSessionWithOptions(session.Options{ 463 | Config: config, 464 | Profile: cfg.SQSProfile, 465 | SharedConfigState: session.SharedConfigEnable, 466 | }) 467 | if err != nil { 468 | log.Fatalf("Failed to create SQS session with reading ~/.aws/credentials profile: %s, err: %v\n", from.profile, err) 469 | } 470 | sqsSvc := sqs.New(sqssess) 471 | return sqsSvc 472 | } 473 | -------------------------------------------------------------------------------- /s3tos3.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "crypto/md5" 7 | "encoding/base64" 8 | "log" 9 | "mime" 10 | "path" 11 | "path/filepath" 12 | "strings" 13 | "sync" 14 | "sync/atomic" 15 | 16 | "github.com/aws/aws-sdk-go/aws" 17 | "github.com/aws/aws-sdk-go/service/s3" 18 | "github.com/aws/aws-sdk-go/service/s3/s3manager" 19 | "golang.org/x/sync/semaphore" 20 | ) 21 | 22 | func s3tos3(from, to BInfo) error { 23 | ignoreList := getIgnoreList() 24 | var wg sync.WaitGroup 25 | semFile := semaphore.NewWeighted(int64(cfg.NumWorkers)) // 并发量为NumWorkers的信号量 for file 26 | 27 | targetObjectList := make([]*s3.Object, 0) 28 | var err error 29 | if cfg.ListTarget && !cfg.SkipCompare { 30 | targetObjectList, err = getS3ObjectList(to) // 获取目标 S3 桶中的文件列表 31 | if err != nil { 32 | return err 33 | } 34 | } 35 | 36 | multipartUploadsList, _ := getMultipartUploadList(to.svc, to.bucket, to.prefix) 37 | 38 | // 遍历源S3 39 | inputListSource := &s3.ListObjectsV2Input{ 40 | Bucket: aws.String(from.bucket), 41 | Prefix: aws.String(from.prefix), 42 | } 43 | if from.requestPayer { 44 | inputListSource.RequestPayer = aws.String("requester") 45 | } 46 | log.Printf("Listing srouce s3://%s\n", path.Join(from.bucket, from.prefix)) 47 | err = from.svc.ListObjectsV2Pages(inputListSource, func(page *s3.ListObjectsV2Output, lastPage bool) bool { 48 | for _, item := range page.Contents { 49 | // Skip if the object is a directory 50 | if strings.HasSuffix(*item.Key, "/") { 51 | log.Println("...Skipping directory", *item.Key) 52 | continue 53 | } 54 | 55 | // Skip if key in ignoreList 56 | if isIgnored(*item.Key, ignoreList) { 57 | log.Println("...Skiping ignored key in ignoreList", *item.Key) 58 | } 59 | 60 | var combinedKey string 61 | if *item.Key != from.prefix { 62 | combinedKey = path.Join(to.prefix, strings.TrimPrefix(*item.Key, from.prefix)) 63 | combinedKey = strings.TrimPrefix(combinedKey, "/") 64 | } else { 65 | combinedKey = path.Join(to.prefix, path.Base(*item.Key)) 66 | } 67 | contentType := mime.TypeByExtension(filepath.Ext(*item.Key)) 68 | fileInfo := FileInfo{ 69 | FromBucket: from.bucket, 70 | FromKey: *item.Key, 71 | ToBucket: to.bucket, 72 | ToKey: combinedKey, 73 | Size: *item.Size, 74 | Others: MetaStruct{ContentType: &contentType}, 75 | } 76 | err = s3tos3Action(from, to, fileInfo, semFile, &wg, multipartUploadsList, targetObjectList) 77 | if err != nil { 78 | log.Println("Failed to s3tos3Action", err) 79 | return false 80 | } 81 | } 82 | return true 83 | }) 84 | if err != nil { 85 | log.Println("Failed to list objects", err) 86 | return err 87 | } 88 | wg.Wait() 89 | return err 90 | } 91 | 92 | func s3tos3Action(from, to BInfo, fileInfo FileInfo, semFile *semaphore.Weighted, wg *sync.WaitGroup, multipartUploadsList []*s3.MultipartUpload, targetObjectList []*s3.Object) error { 93 | if cfg.TransferMetadata { 94 | err := getMetadata(from, &fileInfo) 95 | if err != nil { 96 | return err 97 | } 98 | } 99 | 100 | // Check file exist on S3 Bucket and get uploadId 101 | uploadId, err := getUploadId(to.svc, fileInfo, multipartUploadsList, targetObjectList) 102 | if err != nil { 103 | return err 104 | } 105 | if uploadId == "NEXT" { 106 | log.Printf("...File exists and same size. Skipping target. s3://%s\n", path.Join(fileInfo.ToBucket, fileInfo.ToKey)) 107 | return nil 108 | } 109 | 110 | semFile.Acquire(context.Background(), 1) //从线程信号池中获取,没有线程可用了就阻塞等待 111 | atomic.AddInt32(&runningGoroutines, 1) //线程计数 112 | wg.Add(1) 113 | go func() { 114 | defer wg.Done() 115 | defer semFile.Release(1) //释放线程信号池 116 | defer atomic.AddInt32(&runningGoroutines, -1) //线程计数 117 | 118 | if fileInfo.Size < cfg.ResumableThreshold { 119 | err := transferSmall(from, to, fileInfo) 120 | if err != nil { 121 | log.Println("Failed to transferSmall", err) 122 | return 123 | } 124 | } else { 125 | // >= ResumableThreshold 126 | log.Printf(" Start to transfer (>= ResumableThreshold) s3://%s, runningGoroutines: %d\n", path.Join(fileInfo.ToBucket, fileInfo.ToKey), runningGoroutines) 127 | err := transferMultipart(from, to, uploadId, fileInfo) 128 | if err != nil { 129 | log.Println("Failed to multipartProccess", err) 130 | return 131 | } 132 | } 133 | log.Printf("***Successfully transfered s3://%s\n", path.Join(fileInfo.ToBucket, fileInfo.ToKey)) 134 | atomic.AddInt64(&objectCount, 1) 135 | atomic.AddInt64(&sizeCount, fileInfo.Size) 136 | }() 137 | return nil 138 | } 139 | 140 | func transferSmall(from, to BInfo, fileInfo FileInfo) error { 141 | log.Printf(" Start to download (< ResumableThreshold) from %s/%s, runningGoroutines: %d\n", fileInfo.FromBucket, fileInfo.FromKey, runningGoroutines) 142 | buff := &aws.WriteAtBuffer{} 143 | inputDownload := &s3.GetObjectInput{ 144 | Bucket: aws.String(fileInfo.FromBucket), 145 | Key: aws.String(fileInfo.FromKey), 146 | } 147 | if from.requestPayer { 148 | inputDownload.RequestPayer = aws.String("requester") 149 | } 150 | _, err := from.downloader.Download(buff, inputDownload) 151 | if err != nil { 152 | log.Println("Error download from", from.url, err) 153 | return err 154 | } 155 | 156 | md5Hash := md5.Sum(buff.Bytes()) 157 | md5Str := base64.StdEncoding.EncodeToString(md5Hash[:]) 158 | log.Printf(" Start to upload (< ResumableThreshold) to %s/%s, runningGoroutines: %d\n", fileInfo.ToBucket, fileInfo.ToKey, runningGoroutines) 159 | inputUpload := &s3manager.UploadInput{ 160 | Bucket: aws.String(fileInfo.ToBucket), 161 | Key: aws.String(fileInfo.ToKey), 162 | Body: bytes.NewReader(buff.Bytes()), 163 | ContentMD5: aws.String(md5Str), 164 | } 165 | if to.storageClass != "" { 166 | inputUpload.StorageClass = aws.String(to.storageClass) 167 | } 168 | if to.ACL != "" { 169 | inputUpload.ACL = aws.String(to.ACL) 170 | } 171 | 172 | if fileInfo.Others.ContentType != nil && *fileInfo.Others.ContentType != "" { 173 | inputUpload.ContentType = fileInfo.Others.ContentType 174 | } 175 | if cfg.TransferMetadata { 176 | inputUpload.Metadata = fileInfo.Others.Metadata 177 | inputUpload.ContentEncoding = fileInfo.Others.ContentEncoding 178 | inputUpload.ContentLanguage = fileInfo.Others.ContentLanguage 179 | inputUpload.CacheControl = fileInfo.Others.CacheControl 180 | inputUpload.ContentDisposition = fileInfo.Others.ContentDisposition 181 | } 182 | _, err = to.uploader.Upload(inputUpload) 183 | if err != nil { 184 | log.Println("Error upload to", to.url, err) 185 | return err 186 | } 187 | return nil 188 | } 189 | 190 | func transferPart(from, to BInfo, partInfo PartInfo, wg *sync.WaitGroup, sem *semaphore.Weighted, uploadId string, partnumberList *[]PartInfo, partnumberListMutex *sync.Mutex) error { 191 | defer wg.Done() 192 | defer sem.Release(1) 193 | defer atomic.AddInt32(&runningGoroutines, -1) 194 | 195 | // Download part S3 API Call 196 | buffer, err := downloadPartAction(from.svc, partInfo) 197 | if err != nil { 198 | return err 199 | } 200 | // Upload part S3 API Call 201 | err = uploadPartAction(buffer, partInfo, to.svc, uploadId, partnumberList, partnumberListMutex) 202 | if err != nil { 203 | return err 204 | } 205 | return nil 206 | } 207 | -------------------------------------------------------------------------------- /sqs2trans.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "log" 7 | "net/url" 8 | "os" 9 | "path" 10 | "strings" 11 | "sync" 12 | "time" 13 | 14 | "github.com/aws/aws-sdk-go/aws" 15 | "github.com/aws/aws-sdk-go/service/s3" 16 | "github.com/aws/aws-sdk-go/service/sqs" 17 | "golang.org/x/sync/semaphore" 18 | ) 19 | 20 | type S3 struct { 21 | Bucket struct { 22 | Name string 23 | } 24 | Object struct { 25 | Key string 26 | Size int64 27 | } 28 | } 29 | 30 | type Record struct { 31 | EventVersion string 32 | EventSource string 33 | AwsRegion string 34 | EventTime string 35 | EventName string 36 | S3 S3 37 | } 38 | 39 | type Message struct { 40 | Records []Record 41 | Event string 42 | } 43 | 44 | // CompareBucket and send SQS message, not checking Head file (no TransferMetadata) 45 | func compareBucket(from, to BInfo, sqsSvc *sqs.SQS) error { 46 | var wg sync.WaitGroup 47 | var ignoreList []*string 48 | var fromList, toList []*s3.Object 49 | var err error 50 | var jobList []Message 51 | 52 | wg.Add(3) 53 | go func() { 54 | defer wg.Done() 55 | ignoreList = getIgnoreList() 56 | if err != nil { 57 | log.Fatalln(err) 58 | } 59 | }() 60 | go func() { 61 | defer wg.Done() 62 | fromList, err = getS3ObjectList(from) 63 | if err != nil { 64 | log.Fatalln(err) 65 | } 66 | }() 67 | go func() { 68 | defer wg.Done() 69 | toList, err = getS3ObjectList(to) 70 | if err != nil { 71 | log.Fatalln(err) 72 | } 73 | }() 74 | wg.Wait() 75 | // Compare each objects's name and size, pick up the delta 76 | jobList, sizeCount = compareS3Objects(fromList, toList, ignoreList, from, to) 77 | objectCount = int64(len(jobList)) 78 | 79 | // sqsSvc 如果nil就是不发SQS,只统计和写log 80 | if sqsSvc != nil { 81 | log.Printf("Found %d jobs to send SQS\n", objectCount) 82 | wg.Add(1) 83 | go func() { 84 | // Send SQS Message in batch 85 | defer wg.Done() 86 | log.Printf("Uploading jobs to SQS queue: %s\n", cfg.SQSUrl) 87 | err = sendSQS(jobList, sqsSvc) 88 | if err != nil { 89 | log.Println("Failed to send SQS messages", err) 90 | } 91 | }() 92 | } 93 | 94 | // Write jobList to file 95 | if cfg.JobListPath != "" { 96 | wg.Add(1) 97 | go func() { 98 | // Write Messages to file 99 | defer wg.Done() 100 | log.Println("Writing SQS messages to file", cfg.JobListPath) 101 | err = writeJobListFile(jobList, cfg.JobListPath) 102 | if err != nil { 103 | log.Println("Failed to write SQS messages to file", err) 104 | } 105 | }() 106 | } 107 | wg.Wait() 108 | return nil 109 | } 110 | 111 | // Compare S3 Objects, return delta list 112 | func compareS3Objects(fromList, toList []*s3.Object, ignoreList []*string, from, to BInfo) ([]Message, int64) { 113 | var listSizeCount = int64(0) 114 | delta := make([]Message, 0) 115 | fromMap := make(map[string]*s3.Object) 116 | toMap := make(map[string]*s3.Object) 117 | 118 | for _, obj := range fromList { 119 | if !isIgnored(*obj.Key, ignoreList) { 120 | fromMap[*obj.Key] = obj 121 | } 122 | } 123 | for _, obj := range toList { 124 | toMap[*obj.Key] = obj 125 | } 126 | 127 | // 只生成在From有而To没有的或Size不同的 128 | for key, fromObj := range fromMap { 129 | // 根据源和目标的 prefix 创建目标 key 130 | toKey := path.Join(to.prefix, strings.TrimPrefix(key, from.prefix)) 131 | 132 | toObj, ok := toMap[toKey] 133 | if !ok || *toObj.Size != *fromObj.Size { 134 | records := []Record{ 135 | { 136 | EventVersion: "2.1", 137 | EventSource: "aws:s3", 138 | AwsRegion: from.region, // You may need to update this 139 | EventTime: time.Now().Format(time.RFC3339), // Use current time 140 | EventName: "ObjectCreated:Put", // Assume object creation 141 | S3: S3{ 142 | Bucket: struct { 143 | Name string 144 | }{ 145 | Name: from.bucket, 146 | }, 147 | Object: struct { 148 | Key string 149 | Size int64 150 | }{ 151 | Key: key, 152 | Size: *fromObj.Size, 153 | }, 154 | }, 155 | }, 156 | } 157 | msg := Message{Records: records} 158 | delta = append(delta, msg) 159 | listSizeCount += *fromObj.Size 160 | } 161 | } 162 | return delta, listSizeCount 163 | } 164 | 165 | // Send SQS Message in batch with concurrency goroutines 166 | func sendSQS(jobList []Message, sqsSvc *sqs.SQS) error { 167 | var sqsBatch int 168 | var sqsMessage []*sqs.SendMessageBatchRequestEntry 169 | var wg sync.WaitGroup 170 | BatchSize := 10 171 | 172 | // Create a buffered channel to hold the jobs 要并发写SQS,所以用channel做buffer 173 | jobs := make(chan []*sqs.SendMessageBatchRequestEntry, cfg.NumWorkers) 174 | 175 | // Start the workers concurrency cfg.NumWorkers 176 | for i := 0; i < cfg.NumWorkers; i++ { 177 | wg.Add(1) 178 | go sendSQSWorker(i, jobs, &wg, sqsSvc) 179 | } 180 | 181 | // Send SQS Message in batch 182 | for i, job := range jobList { 183 | jobJSON, err := json.Marshal(job) 184 | if err != nil { 185 | return fmt.Errorf("failed to marshal job: %v", err) 186 | } 187 | sqsMessage = append(sqsMessage, &sqs.SendMessageBatchRequestEntry{ 188 | Id: aws.String(fmt.Sprint(i)), 189 | MessageBody: aws.String(string(jobJSON)), 190 | }) 191 | sqsBatch++ 192 | 193 | if sqsBatch == BatchSize || i == len(jobList)-1 { 194 | // Copy sqsMessage to prevent data race 195 | sqsMessageCopy := make([]*sqs.SendMessageBatchRequestEntry, len(sqsMessage)) 196 | copy(sqsMessageCopy, sqsMessage) 197 | 198 | // Send a job to the workers 199 | jobs <- sqsMessageCopy 200 | 201 | sqsBatch = 0 202 | sqsMessage = sqsMessage[:0] 203 | } 204 | } 205 | 206 | close(jobs) // close the jobs channel 207 | wg.Wait() 208 | log.Printf("Complete upload job to queue: %s\n", cfg.SQSUrl) 209 | return nil 210 | } 211 | 212 | func sendSQSWorker(id int, jobs <-chan []*sqs.SendMessageBatchRequestEntry, wg *sync.WaitGroup, sqsSvc *sqs.SQS) { 213 | defer wg.Done() 214 | var file *os.File 215 | var err error 216 | var logPath string 217 | 218 | // Prepare SQS sent log for writing a file, it is for backup 219 | if cfg.SQSSentLogName != "" { 220 | // Create SQS sent log file 221 | dateTimePrefix := time.Now().Format("20060102150405") 222 | logPath = fmt.Sprintf("%s-%s-sqs-sent-%d.log", cfg.SQSSentLogName, dateTimePrefix, id) 223 | file, err = os.Create(logPath) 224 | if err != nil { 225 | log.Printf("Failed to create SQS sent log file: %v\n", err) 226 | return 227 | } 228 | defer file.Close() 229 | } 230 | 231 | for job := range jobs { 232 | 233 | // Send Message to SQS 234 | _, err := sqsSvc.SendMessageBatch(&sqs.SendMessageBatchInput{ 235 | QueueUrl: aws.String(cfg.SQSUrl), 236 | Entries: job, 237 | }) 238 | if err != nil { 239 | log.Printf("Worker %d: Failed to send sqs message: %v; JobList: %v\n", id, err, job) 240 | continue 241 | } 242 | 243 | // Write SQS sent log to file for backup 244 | if cfg.SQSSentLogName != "" { 245 | for _, entry := range job { 246 | var messageBody map[string]interface{} 247 | err := json.Unmarshal([]byte(*entry.MessageBody), &messageBody) 248 | if err != nil { 249 | log.Printf("Worker %d: Failed to unmarshal MessageBody: %v\n", id, err) 250 | continue 251 | } 252 | messageBodyJson, err := json.Marshal(messageBody) 253 | if err != nil { 254 | log.Printf("Worker %d: Failed to marshal MessageBody to JSON: %v\n", id, err) 255 | continue 256 | } 257 | _, err = file.WriteString(string(messageBodyJson) + "\n") 258 | if err != nil { 259 | log.Printf("Worker %d: Failed to write SQS sent log: %v\n", id, err) 260 | continue 261 | } 262 | } 263 | } 264 | } 265 | log.Printf("Worker %d: Complete upload job to queue\n", id) 266 | if cfg.SQSSentLogName != "" { 267 | log.Printf("Worker %d: Complete write SQS sent log to file: %s\n", id, logPath) 268 | } 269 | } 270 | 271 | func writeJobListFile(jobList []Message, path string) error { 272 | 273 | // Check if the file exists 274 | if _, err := os.Stat(path); os.IsNotExist(err) { 275 | // If not, create the file 276 | file, err := os.Create(path) 277 | if err != nil { 278 | fmt.Println("Error creating file: ", err) 279 | return err 280 | } 281 | file.Close() 282 | } 283 | // Open the file in append mode 284 | file, err := os.OpenFile(path, os.O_APPEND|os.O_WRONLY, 0644) 285 | if err != nil { 286 | fmt.Println("Error opening file: ", err) 287 | return err 288 | } 289 | defer file.Close() 290 | 291 | for _, job := range jobList { 292 | jobJSON, err := json.Marshal(job) 293 | if err != nil { 294 | return fmt.Errorf("failed to marshal job: %v", err) 295 | } 296 | _, err = file.WriteString(string(jobJSON) + "\n") 297 | if err != nil { 298 | return err 299 | } 300 | } 301 | log.Println("Complete writing job list to file", path) 302 | 303 | return nil 304 | } 305 | 306 | func consumeSQS(sqsSvc *sqs.SQS) error { 307 | var wgsqs sync.WaitGroup // 用于等待所有worker完成(适配s3tos3Action) 308 | semFile := semaphore.NewWeighted(int64(cfg.NumWorkers)) // 并发量为NumWorkers的信号量(适配s3tos3Action) 309 | ignoreList := getIgnoreList() 310 | for i := 0; i < cfg.NumWorkers; i++ { 311 | wgsqs.Add(1) 312 | go getSQSWorker(i, semFile, &wgsqs, sqsSvc, ignoreList) 313 | } 314 | wgsqs.Wait() 315 | return nil 316 | } 317 | 318 | func getSQSWorker(i int, semFile *semaphore.Weighted, wgsqs *sync.WaitGroup, sqsSvc *sqs.SQS, ignoreList []*string) { 319 | defer wgsqs.Done() 320 | sqsBatch := aws.Int64(10) 321 | var wg sync.WaitGroup 322 | 323 | for { 324 | resp, err := sqsSvc.ReceiveMessage(&sqs.ReceiveMessageInput{ 325 | QueueUrl: &cfg.SQSUrl, 326 | MaxNumberOfMessages: sqsBatch, 327 | }) 328 | if err != nil { 329 | log.Printf("Worker %d: Failed to get SQS. Wait for 5 seconds. ERR: %v\n", i, err) 330 | time.Sleep(time.Duration(cfg.RetryDelay) * time.Second) 331 | continue 332 | } 333 | 334 | if len(resp.Messages) == 0 { 335 | log.Printf("Worker %d: No message in queue available, wait...", i) 336 | time.Sleep(60 * time.Second) 337 | continue 338 | } 339 | log.Printf("Worker %d: Received %d messages\n", i, len(resp.Messages)) 340 | // 对Batch Message中的每个Record进行处理 341 | for _, message := range resp.Messages { 342 | var msg Message 343 | var transferErr error 344 | transferErr = json.Unmarshal([]byte(*message.Body), &msg) 345 | if transferErr != nil { 346 | log.Printf("Worker %d: Failed to parse SQS message. ERR: %v\n", i, err) 347 | continue 348 | } 349 | 350 | for _, record := range msg.Records { 351 | // 根据源和目标的 prefix 创建目标 key 352 | // Decode the key from URL format 避免中间出现 + 号的情况 353 | var decodedKey string 354 | decodedKey, transferErr = url.QueryUnescape(record.S3.Object.Key) 355 | if transferErr != nil { 356 | log.Printf("Failed to decode key: %v. ERR: %v\n", record.S3.Object.Key, err) 357 | continue // TODO: 这里跳出去之后会跑到SQS Del去了 358 | } 359 | 360 | // ignore list 361 | if isIgnored(decodedKey, ignoreList) { 362 | log.Printf("Skipping ignored key in ignoreList %s\n", decodedKey) 363 | continue 364 | } 365 | 366 | toKey := path.Join(to.prefix, strings.TrimPrefix(decodedKey, from.prefix)) 367 | fileInfo := FileInfo{ 368 | FromKey: decodedKey, 369 | FromBucket: record.S3.Bucket.Name, 370 | Size: record.S3.Object.Size, 371 | ToBucket: to.bucket, 372 | ToKey: toKey, 373 | } 374 | 375 | // Transfer object 376 | if strings.HasPrefix(record.EventName, "ObjectCreated:") { 377 | targetObjectList := make([]*s3.Object, 0) // 按照SQS消息来传输,将忽略targetObjectList 378 | multipartUploadsList := make([]*s3.MultipartUpload, 0) 379 | if fileInfo.Size >= cfg.ResumableThreshold { 380 | multipartUploadsList, _ = getMultipartUploadList(to.svc, fileInfo.ToBucket, fileInfo.ToKey) // 查当前key是否有未完成的Multipart Upload 381 | } 382 | transferErr = s3tos3Action(from, to, fileInfo, semFile, &wg, multipartUploadsList, targetObjectList) 383 | wg.Wait() 384 | if transferErr != nil { 385 | log.Printf("Worker %d: Failed to transfer object: %v\n", i, err) 386 | continue // TODO: 这里跳出去之后会跑到SQS Del去了 387 | } 388 | } 389 | // Delete object 390 | if strings.HasPrefix(record.EventName, "ObjectRemoved:") { 391 | transferErr = delObjcet(to.svc, fileInfo.ToBucket, fileInfo.ToKey) 392 | } 393 | } 394 | 395 | // Skip processing for "s3:TestEvent" 396 | if msg.Event == "s3:TestEvent" { 397 | fmt.Println("Skipping Test Event") 398 | } 399 | // Delete SQS message 400 | if transferErr == nil { 401 | err = delSQS(message, sqsSvc) 402 | if err != nil { 403 | log.Printf("Worker %d: Failed to delete SQS message: %v\n", i, err) 404 | continue 405 | } 406 | } 407 | } 408 | } 409 | } 410 | 411 | func delSQS(message *sqs.Message, sqsSvc *sqs.SQS) error { 412 | _, err := sqsSvc.DeleteMessage(&sqs.DeleteMessageInput{ 413 | QueueUrl: &cfg.SQSUrl, 414 | ReceiptHandle: message.ReceiptHandle, 415 | }) 416 | if err != nil { 417 | return err 418 | } 419 | return nil 420 | } 421 | 422 | func delObjcet(svc *s3.S3, bucket, key string) error { 423 | _, err := svc.DeleteObject(&s3.DeleteObjectInput{ 424 | Bucket: aws.String(bucket), 425 | Key: aws.String(key), 426 | }) 427 | if err != nil { 428 | return err 429 | } 430 | return nil 431 | } 432 | -------------------------------------------------------------------------------- /upload.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "crypto/md5" 7 | "encoding/base64" 8 | "errors" 9 | "io" 10 | "log" 11 | "mime" 12 | "os" 13 | "path" 14 | "path/filepath" 15 | "strings" 16 | "sync" 17 | "sync/atomic" 18 | 19 | "github.com/aws/aws-sdk-go/aws" 20 | "github.com/aws/aws-sdk-go/service/s3" 21 | "github.com/aws/aws-sdk-go/service/s3/s3manager" 22 | "golang.org/x/sync/semaphore" 23 | ) 24 | 25 | func startUpload(from, to BInfo) error { 26 | ignoreList := getIgnoreList() 27 | var wg sync.WaitGroup 28 | semFile := semaphore.NewWeighted(int64(cfg.NumWorkers)) // 并发量为numWorkers的信号量 for file 29 | 30 | // List Target S3 31 | targetObjectList := make([]*s3.Object, 0) 32 | var err error 33 | if cfg.ListTarget && !cfg.SkipCompare { 34 | targetObjectList, err = getS3ObjectList(to) // 获取目标 S3 桶中的文件列表 35 | if err != nil { 36 | return err 37 | } 38 | } 39 | 40 | // Listing multipart uploads ID 41 | multipartUploadsList, err := getMultipartUploadList(to.svc, to.bucket, to.prefix) 42 | if err != nil { 43 | return err 44 | } 45 | 46 | // Walk through local path for uploading 47 | err = filepath.Walk(from.url, func(thispath string, info os.FileInfo, err error) error { 48 | if err != nil { 49 | log.Println("Failed to access path", thispath, err) 50 | return err 51 | } 52 | // Skip if key in ignoreList 53 | if isIgnored(thispath, ignoreList) { 54 | log.Println("...Skiping ignored key in ignoreList", thispath) 55 | } 56 | // Skip if the path is a directory 57 | if info.IsDir() { 58 | return nil 59 | } 60 | combinedKey := path.Join(to.prefix, filepath.ToSlash(strings.TrimPrefix(thispath, filepath.Dir(from.url)))) 61 | contentType := mime.TypeByExtension(filepath.Ext(thispath)) 62 | fileInfo := FileInfo{ 63 | ToKey: combinedKey, 64 | ToBucket: to.bucket, 65 | Size: info.Size(), 66 | Others: MetaStruct{ContentType: &contentType}, 67 | } 68 | 69 | // Check file exist on S3 Bucket and get uploadId 70 | uploadId, err := getUploadId(to.svc, fileInfo, multipartUploadsList, targetObjectList) 71 | if err != nil { 72 | return err 73 | } 74 | if uploadId == "NEXT" { 75 | log.Printf("...File already exists. Skipping... s3://%s\n", path.Join(fileInfo.ToBucket, fileInfo.ToKey)) 76 | return nil 77 | } 78 | 79 | semFile.Acquire(context.Background(), 1) //从线程信号池中获取,没有线程可用了就阻塞等待 80 | atomic.AddInt32(&runningGoroutines, 1) //线程计数 81 | wg.Add(1) 82 | go func(thispath string, info os.FileInfo, uploadId string) { 83 | defer wg.Done() 84 | defer semFile.Release(1) 85 | defer atomic.AddInt32(&runningGoroutines, -1) 86 | 87 | fileInfo.File, err = os.Open(thispath) 88 | if err != nil { 89 | log.Println("Failed to open file", thispath, err) 90 | return 91 | } 92 | defer fileInfo.File.Close() 93 | 94 | if info.Size() < cfg.ResumableThreshold { 95 | log.Printf(" Start to upload (< ResumableThreshold): %s to s3://%s, runningGoroutines: %d\n", thispath, path.Join(fileInfo.ToBucket, fileInfo.ToKey), runningGoroutines) 96 | err := uploadSmall(fileInfo, thispath, info, uploadId) 97 | if err != nil { 98 | log.Printf("Failed to uploadSmall: s3://%s, %v\n", path.Join(fileInfo.ToBucket, fileInfo.ToKey), err) 99 | return 100 | } 101 | // Success upload 102 | } else { 103 | // info.Size() >= ResumableThreshold Use multipart upload for large files resumable upload 104 | log.Printf(" Start to upload (>= ResumableThreshold): %s to s3://%s, runningGoroutines: %d\n", thispath, path.Join(fileInfo.ToBucket, fileInfo.ToKey), runningGoroutines) 105 | err := transferMultipart(from, to, uploadId, fileInfo) 106 | if err != nil { 107 | log.Printf("Failed to multipartProccess: s3://%s, %v\n", path.Join(fileInfo.ToBucket, fileInfo.ToKey), err) 108 | return 109 | } 110 | } 111 | log.Printf("***Successfully uploaded: s3://%s\n", path.Join(fileInfo.ToBucket, fileInfo.ToKey)) 112 | atomic.AddInt64(&objectCount, 1) 113 | atomic.AddInt64(&sizeCount, info.Size()) 114 | }(thispath, info, uploadId) 115 | return nil 116 | }) 117 | wg.Wait() 118 | if err != nil { 119 | log.Println("Failed to walk directory", err) 120 | return err 121 | } 122 | return err 123 | } 124 | 125 | func uploadSmall(fileInfo FileInfo, thispath string, info os.FileInfo, uploadId string) error { 126 | buff, err := io.ReadAll(fileInfo.File) 127 | if err != nil { 128 | log.Println("Failed to read file", thispath, err) 129 | return err 130 | } 131 | 132 | md5Hash := md5.Sum(buff) 133 | md5Str := base64.StdEncoding.EncodeToString(md5Hash[:]) 134 | 135 | inputUpload := &s3manager.UploadInput{ 136 | Bucket: aws.String(fileInfo.ToBucket), 137 | Key: aws.String(fileInfo.ToKey), 138 | Body: bytes.NewReader(buff), 139 | ContentMD5: aws.String(md5Str), 140 | } 141 | if to.storageClass != "" { 142 | inputUpload.StorageClass = aws.String(to.storageClass) 143 | } 144 | if to.ACL != "" { 145 | inputUpload.ACL = aws.String(to.ACL) 146 | } 147 | if fileInfo.Others.ContentType != nil && *fileInfo.Others.ContentType != "" { 148 | inputUpload.ContentType = fileInfo.Others.ContentType 149 | } 150 | _, err = to.uploader.Upload(inputUpload) 151 | if err != nil { 152 | log.Printf("Failed to upload file s3://%s, err: %v\n", path.Join(fileInfo.ToBucket, fileInfo.ToKey), err) 153 | return err 154 | } 155 | 156 | return nil 157 | } 158 | 159 | func transferMultipart(from, to BInfo, uploadId string, fileInfo FileInfo) error { 160 | semPart := semaphore.NewWeighted(int64(cfg.NumWorkers * 4)) // 并发量为numWorkers的信号量 for parts 161 | var partnumberList []PartInfo 162 | var partnumberListMutex sync.Mutex 163 | var fileMutex sync.Mutex 164 | var err error 165 | 166 | if uploadId == "" { 167 | inputCreate := &s3.CreateMultipartUploadInput{ 168 | Bucket: aws.String(fileInfo.ToBucket), 169 | Key: aws.String(fileInfo.ToKey), 170 | } 171 | if to.storageClass != "" { 172 | inputCreate.StorageClass = aws.String(to.storageClass) 173 | } 174 | if to.ACL != "" { 175 | inputCreate.ACL = aws.String(to.ACL) 176 | } 177 | if fileInfo.Others.ContentType != nil && *fileInfo.Others.ContentType != "" { 178 | inputCreate.ContentType = fileInfo.Others.ContentType 179 | } 180 | if cfg.TransferMetadata { 181 | inputCreate.Metadata = fileInfo.Others.Metadata 182 | inputCreate.ContentEncoding = fileInfo.Others.ContentEncoding 183 | inputCreate.CacheControl = fileInfo.Others.CacheControl 184 | inputCreate.ContentLanguage = fileInfo.Others.ContentLanguage 185 | inputCreate.ContentDisposition = fileInfo.Others.ContentDisposition 186 | } 187 | resp, err := to.svc.CreateMultipartUpload(inputCreate) 188 | if err != nil { 189 | log.Println("Failed to create multipart upload", fileInfo.ToBucket, fileInfo.ToKey, err) 190 | return err 191 | } 192 | uploadId = *resp.UploadId 193 | } else { 194 | partnumberList, err = checkPartnumberList(to.svc, fileInfo.ToBucket, fileInfo.ToKey, uploadId) 195 | if err != nil { 196 | log.Println("Failed to get part number list", fileInfo.ToBucket, fileInfo.ToKey, err) 197 | _ = partnumberList 198 | return err 199 | } 200 | } 201 | 202 | indexList, chunkSizeAuto := split(fileInfo, cfg.ChunkSize) 203 | 204 | var wg2 sync.WaitGroup 205 | for i, offset := range indexList { 206 | // 检查i是否在partnumberList里面 207 | found := false 208 | for _, value := range partnumberList { 209 | if int64(i+1) == value.PartNumber { 210 | found = true 211 | break 212 | } 213 | } 214 | // 如果已经在partnumberList则跳过到下一个i 215 | if found { 216 | continue 217 | } 218 | 219 | // 不在partnumberList,上传该part 220 | size := chunkSizeAuto 221 | if offset+chunkSizeAuto > fileInfo.Size { // 如果是最后一个分片则给实际size 222 | size = fileInfo.Size - offset 223 | } 224 | partInfo := PartInfo{ 225 | ToKey: fileInfo.ToKey, 226 | ToBucket: fileInfo.ToBucket, 227 | FromKey: fileInfo.FromKey, 228 | FromBucket: fileInfo.FromBucket, 229 | PartNumber: int64(i + 1), 230 | Size: size, 231 | Offset: offset, 232 | TotalParts: int64(len(indexList)), 233 | } 234 | 235 | semPart.Acquire(context.Background(), 1) //从线程池中获取,没有线程可用了就阻塞等待 236 | atomic.AddInt32(&runningGoroutines, 1) //线程计数 237 | wg2.Add(1) 238 | if fileInfo.File != nil { 239 | go uploadPart(to.svc, partInfo, &wg2, semPart, uploadId, &partnumberList, &partnumberListMutex, fileInfo.File, &fileMutex) 240 | } else { 241 | // s3tos3 part 242 | go transferPart(from, to, partInfo, &wg2, semPart, uploadId, &partnumberList, &partnumberListMutex) 243 | } 244 | 245 | } 246 | wg2.Wait() 247 | if len(indexList) == len(partnumberList) { 248 | err := completeUpload(to.svc, uploadId, fileInfo.ToBucket, fileInfo.ToKey, &partnumberList) 249 | if err != nil { 250 | log.Println("Failed to complete upload", err, fileInfo.ToBucket, fileInfo.ToKey) 251 | return err 252 | } 253 | // Success complete upload 254 | } else { 255 | log.Println("Failed to complete upload, len(indexList) != len(partnumberList)", fileInfo.ToBucket, fileInfo.ToKey, len(indexList), len(partnumberList)) 256 | return errors.New("failed to complete upload, len(indexList) != len(partnumberList)") 257 | } 258 | return nil 259 | } 260 | 261 | func uploadPart(svc *s3.S3, partInfo PartInfo, wg *sync.WaitGroup, sem *semaphore.Weighted, uploadId string, partnumberList *[]PartInfo, partnumberListMutex *sync.Mutex, file *os.File, fileMutex *sync.Mutex) error { 262 | defer wg.Done() 263 | defer sem.Release(1) 264 | defer atomic.AddInt32(&runningGoroutines, -1) 265 | // log.Printf("-->Uploading s3://%s, part:%d/%d, runningGoroutines: %d\n", path.Join(partInfo.ToBucket, partInfo.ToKey), partInfo.PartNumber, partInfo.TotalParts, runningGoroutines) 266 | 267 | // 创建一个分片大小的缓冲区 268 | fileMutex.Lock() 269 | file.Seek(partInfo.Offset, 0) // 定位到需要读取的文件部分 270 | buffer := make([]byte, partInfo.Size) 271 | _, err := io.ReadFull(file, buffer) 272 | if err != nil { 273 | log.Println("Failed to read full buffer from file", partInfo.ToBucket, partInfo.ToKey, partInfo.PartNumber, err) 274 | return err 275 | } 276 | fileMutex.Unlock() 277 | 278 | // Upload part S3 API Call 279 | err = uploadPartAction(buffer, partInfo, svc, uploadId, partnumberList, partnumberListMutex) 280 | if err != nil { 281 | return err 282 | } 283 | return nil 284 | } 285 | 286 | func uploadPartAction(buff []byte, partInfo PartInfo, svc *s3.S3, uploadId string, partnumberList *[]PartInfo, partnumberListMutex *sync.Mutex) error { 287 | log.Printf("-->Uploading part s3://%s, part:%d/%d, runningGoroutines: %d\n", path.Join(partInfo.ToBucket, partInfo.ToKey), partInfo.PartNumber, partInfo.TotalParts, runningGoroutines) 288 | // 计算分片数据的MD5哈希值 289 | md5Hash := md5.Sum(buff) 290 | md5Str := base64.StdEncoding.EncodeToString(md5Hash[:]) 291 | 292 | // 上传分片 293 | result, err := svc.UploadPart(&s3.UploadPartInput{ 294 | Bucket: aws.String(partInfo.ToBucket), 295 | Key: aws.String(partInfo.ToKey), 296 | PartNumber: aws.Int64(int64(partInfo.PartNumber)), 297 | UploadId: aws.String(uploadId), 298 | Body: bytes.NewReader(buff), 299 | ContentLength: aws.Int64(int64(partInfo.Size)), 300 | ContentMD5: aws.String(md5Str), 301 | }) 302 | if err != nil { 303 | log.Printf("Failed to upload part s3://%s, part:%d/%d, err: %v\n", path.Join(partInfo.ToBucket, partInfo.ToKey), partInfo.PartNumber, partInfo.TotalParts, err) 304 | return nil 305 | } 306 | partnumberListMutex.Lock() 307 | *partnumberList = append(*partnumberList, PartInfo{ 308 | PartNumber: partInfo.PartNumber, 309 | Etag: *result.ETag, 310 | }) 311 | partnumberListMutex.Unlock() 312 | log.Printf("===Uploaded part s3://%s, part:%d/%d\n", path.Join(partInfo.ToBucket, partInfo.ToKey), partInfo.PartNumber, partInfo.TotalParts) 313 | return nil 314 | } 315 | 316 | func completeUpload(svc *s3.S3, uploadId, bucket, key string, partnumberList *[]PartInfo) error { 317 | completedParts := []*s3.CompletedPart{} 318 | var i int64 319 | for i = 1; i <= int64(len(*partnumberList)); i++ { 320 | for _, partNumber := range *partnumberList { 321 | if i == partNumber.PartNumber { 322 | completedParts = append(completedParts, &s3.CompletedPart{ 323 | ETag: &partNumber.Etag, 324 | PartNumber: &partNumber.PartNumber, 325 | }) 326 | break 327 | } 328 | } 329 | } 330 | 331 | _, err := svc.CompleteMultipartUpload(&s3.CompleteMultipartUploadInput{ 332 | Bucket: aws.String(bucket), 333 | Key: aws.String(key), 334 | UploadId: aws.String(uploadId), 335 | MultipartUpload: &s3.CompletedMultipartUpload{ 336 | Parts: completedParts, 337 | }, 338 | }) 339 | return err 340 | } 341 | -------------------------------------------------------------------------------- /util.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "database/sql" 6 | "fmt" 7 | "log" 8 | "math" 9 | "os" 10 | "strings" 11 | "time" 12 | 13 | "github.com/google/uuid" 14 | ) 15 | 16 | func split(fileInfo FileInfo, chunkSize int64) (indexList []int64, actualChunkSize int64) { 17 | partNumber := int64(1) 18 | indexList = []int64{0} 19 | 20 | if int64(math.Ceil(float64(fileInfo.Size)/float64(chunkSize))) > 10000 { 21 | chunkSize = fileInfo.Size/10000 + 1024 // 对于大于10000分片的大文件,自动调整Chunksize 22 | } 23 | 24 | for chunkSize*partNumber < fileInfo.Size { // 如果刚好是"=",则无需再分下一part,所以这里不能用"<=" 25 | indexList = append(indexList, chunkSize*partNumber) 26 | partNumber += 1 27 | } 28 | return indexList, chunkSize 29 | } 30 | 31 | func withRetries(fn RetryFunc) error { 32 | var err error 33 | for i := 0; i < cfg.MaxRetries; i++ { 34 | err = fn() 35 | if err == nil { 36 | break 37 | } 38 | log.Println("Failed to execute function: ", err, ". Retrying...") 39 | time.Sleep(time.Duration(int64(math.Pow(2, float64(i)))) * time.Second) 40 | } 41 | return err 42 | } 43 | 44 | func contains(s []int, e int) bool { 45 | for _, a := range s { 46 | if a == e { 47 | return true 48 | } 49 | } 50 | return false 51 | } 52 | 53 | func ByteCountSI(b int64) string { 54 | const unit = 1024 55 | if b < unit { 56 | return fmt.Sprintf("%dBytes", b) 57 | } 58 | div, exp := int64(unit), 0 59 | for n := b / unit; n >= unit; n /= unit { 60 | div *= unit 61 | exp++ 62 | } 63 | return fmt.Sprintf("%.1f%cBytes", float64(b)/float64(div), "KMGTPE"[exp]) 64 | } 65 | 66 | func getIgnoreList() []*string { 67 | log.Printf("Checking ignore files list in %s\n", cfg.IgnoreListPath) 68 | ignoreList := []*string{} 69 | 70 | _, err := os.Stat(cfg.IgnoreListPath) 71 | if err != nil { 72 | if os.IsNotExist(err) { 73 | log.Printf("No ignore list in path %s\n", cfg.IgnoreListPath) 74 | } else { 75 | log.Println(err) 76 | } 77 | } else { 78 | file, err := os.Open(cfg.IgnoreListPath) 79 | if err != nil { 80 | log.Println(err) 81 | } 82 | defer file.Close() 83 | 84 | scanner := bufio.NewScanner(file) 85 | for scanner.Scan() { 86 | prefix := strings.TrimPrefix(scanner.Text(), "/") 87 | ignoreList = append(ignoreList, &prefix) 88 | } 89 | if err := scanner.Err(); err != nil { 90 | log.Println(err) 91 | } 92 | log.Printf("Found ignore files list with prefix Length: %d, in %s", len(ignoreList), cfg.IgnoreListPath) 93 | } 94 | return ignoreList 95 | } 96 | 97 | func isIgnored(key string, ignoreList []*string) bool { 98 | for _, prefix := range ignoreList { 99 | if strings.HasPrefix(key, *prefix) { 100 | return true 101 | } 102 | } 103 | return false 104 | } 105 | 106 | func getDatabase() (*sql.DB, error) { 107 | var database *sql.DB 108 | var err error 109 | err = withRetries(func() error { 110 | database, err = sql.Open("sqlite3", cfg.DBPath) 111 | if err != nil { 112 | fmt.Println("Failed to connect to sqlite3", err) 113 | return err 114 | } 115 | statement, err := database.Prepare("CREATE TABLE IF NOT EXISTS download (ID TEXT PRIMARY KEY, key TEXT, bucket TEXT, part INT)") 116 | if err != nil { 117 | fmt.Println("Failed to prepare getDatabase statement: ", err) 118 | return err 119 | } 120 | _, err = statement.Exec() 121 | if err != nil { 122 | return err 123 | } 124 | return nil 125 | }) 126 | if err != nil { 127 | return nil, err 128 | } 129 | return database, nil 130 | } 131 | 132 | func recordDownloadPart(partInfo PartInfo) { 133 | err := withRetries(func() error { 134 | database, err := getDatabase() 135 | if err != nil { 136 | fmt.Println("Failed to get sqlite3 database", err) 137 | return err 138 | } 139 | defer database.Close() 140 | uuid, err := uuid.NewRandom() 141 | if err != nil { 142 | return err 143 | } 144 | statement, err := database.Prepare("INSERT INTO download (ID, key, bucket, part) VALUES (?, ?, ?, ?)") 145 | if err != nil { 146 | fmt.Println("Failed to prepare recordDownloadPart statement: ", err) 147 | return err 148 | } 149 | _, execErr := statement.Exec(uuid, partInfo.FromKey, partInfo.FromBucket, partInfo.PartNumber) 150 | if execErr != nil { 151 | fmt.Println("Failed to execute recordDownloadPart statement: ", execErr, ". Retrying...") 152 | } 153 | return execErr 154 | }) 155 | if err != nil { 156 | fmt.Println("Failed to execute recordDownloadPart statement after retries: ", err) 157 | return 158 | } 159 | } 160 | 161 | func getDownloadedParts(fileInfo FileInfo) ([]int, error) { 162 | var partnumberList []int 163 | err := withRetries(func() error { 164 | database, err := getDatabase() 165 | if err != nil { 166 | fmt.Println("Failed to get sqlite3 database", err) 167 | return err 168 | } 169 | defer database.Close() 170 | partnumberList = []int{} 171 | rows, err := database.Query("SELECT part FROM download WHERE key = ? AND bucket = ? ORDER BY part ASC", fileInfo.FromKey, fileInfo.FromBucket) 172 | if err != nil { 173 | fmt.Println("Failed to prepare getDownloadedParts statement: ", err) 174 | return err 175 | } 176 | defer rows.Close() 177 | var part int 178 | for rows.Next() { 179 | err := rows.Scan(&part) 180 | if err != nil { 181 | fmt.Println("Failed to scan row: ", err) 182 | return err 183 | } 184 | partnumberList = append(partnumberList, part) 185 | } 186 | if err = rows.Err(); err != nil { 187 | fmt.Println("Rows iteration error: ", err) 188 | return err 189 | } 190 | return nil 191 | }) 192 | if err != nil { 193 | return nil, err 194 | } 195 | return partnumberList, nil 196 | } 197 | 198 | func deleteDownloadParts(fileInfo FileInfo) error { 199 | err := withRetries(func() error { 200 | database, err := getDatabase() 201 | if err != nil { 202 | fmt.Println("Failed to get sqlite3 database: ", err) 203 | return err 204 | } 205 | defer database.Close() 206 | statement, err := database.Prepare("DELETE FROM download WHERE key = ? AND bucket = ?") 207 | if err != nil { 208 | fmt.Println("Failed to prepare deleteDownloadParts statement: ", err) 209 | return err 210 | } 211 | _, err = statement.Exec(fileInfo.FromKey, fileInfo.FromBucket) 212 | if err != nil { 213 | fmt.Println("Failed to execute deleteDownloadParts statement: ", err) 214 | return err 215 | } 216 | return nil 217 | }) 218 | return err 219 | } 220 | --------------------------------------------------------------------------------