├── tests
│   └── simple
│       ├── version
│       ├── .storage
│       │   ├── 7209184710918930433
│       │   ├── 7209184516538105857
│       │   ├── 7209184677322555393
│       │   └── 7209184749493944321
│       ├── bucket-b.bucket.meta
│       └── bucket-a.bucket.meta
├── project
│   ├── build.properties
│   └── plugins.sbt
├── .gitignore
├── src
│   ├── main
│   │   ├── resources
│   │   │   └── logback.xml
│   │   └── scala
│   │       └── io
│   │           └── github
│   │               └── starofall
│   │                   └── s3hypersync
│   │                       ├── MainApp.scala
│   │                       ├── SyncModel.scala
│   │                       ├── SyncLogging.scala
│   │                       ├── SyncUtil.scala
│   │                       ├── SyncS3Settings.scala
│   │                       ├── SyncCommand.scala
│   │                       ├── S3Connector.scala
│   │                       ├── SyncStatistics.scala
│   │                       ├── JobDefinition.scala
│   │                       └── PekkoFileSyncCompareStage.scala
│   └── test
│       ├── resources
│       │   └── logback-test.xml
│       └── scala
│           └── io
│               └── github
│                   └── starofall
│                       └── s3hypersync
│                           └── SyncCommandTest.scala
├── LICENSE
└── README.md
/tests/simple/version:
--------------------------------------------------------------------------------
1 | 1
--------------------------------------------------------------------------------
/tests/simple/.storage/7209184710918930433:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/simple/.storage/7209184516538105857:
--------------------------------------------------------------------------------
1 | test
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version = 1.10.0
2 |
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | // packs app into a single jar
2 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.2.0")
3 |
--------------------------------------------------------------------------------
/tests/simple/.storage/7209184677322555393:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Starofall/S3HyperSync/HEAD/tests/simple/.storage/7209184677322555393
--------------------------------------------------------------------------------
/tests/simple/.storage/7209184749493944321:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Starofall/S3HyperSync/HEAD/tests/simple/.storage/7209184749493944321
--------------------------------------------------------------------------------
/tests/simple/bucket-b.bucket.meta:
--------------------------------------------------------------------------------
1 | {"objectMap":{},"creationDate":1720449254106,"region":"local","uploads":{},"versioningEnabled":null,"bucketName":"bucket-b","tagging":null,"acl":null,"policy":null,"replication":null,"encryption":null}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 | *.log
3 |
4 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
5 | hs_err_pid*
6 |
7 | /.bsp/**
8 | /.idea/**
9 | /project/project/**
10 | /project/target/**
11 | /target/**
12 |
--------------------------------------------------------------------------------
/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |     <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
3 |         <encoder>
4 |             <!-- plain console output: level and message only -->
5 |             <pattern>%-5level | %msg%n</pattern>
6 |         </encoder>
7 |     </appender>
8 |     <root level="INFO">
9 |         <appender-ref ref="STDOUT"/>
10 |     </root>
11 | </configuration>
--------------------------------------------------------------------------------
/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |     <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
3 |         <encoder>
4 |             <!-- plain console output: level and message only -->
5 |             <pattern>%-5level | %msg%n</pattern>
6 |         </encoder>
7 |     </appender>
8 |     <root level="INFO">
9 |         <appender-ref ref="STDOUT"/>
10 |     </root>
11 | </configuration>
--------------------------------------------------------------------------------
/src/main/scala/io/github/starofall/s3hypersync/MainApp.scala:
--------------------------------------------------------------------------------
1 | package io.github.starofall.s3hypersync
2 |
3 | import io.github.starofall.s3hypersync.SyncLogging.Logger
4 | import io.github.starofall.s3hypersync.SyncUtil._
5 | import org.apache.pekko.actor.ActorSystem
6 |
7 | import scala.concurrent.ExecutionContextExecutor
8 |
9 |
10 | object MainApp extends Logger {
11 |
12 | /** main method for CLI interaction */
13 | def main(args: Array[String]): Unit = {
14 | val conf = new JobDefinition(args.toIndexedSeq)
15 | implicit val actorSystem: ActorSystem = ActorSystem(
16 | "SyncSystem", buildConfig(conf))
17 | implicit val exc : ExecutionContextExecutor = actorSystem.dispatcher
18 | SyncLogging.initLogger(conf)
19 | addTimeoutIfNeeded(conf)
20 | new SyncCommand(conf)
21 | .runSyncJob()
22 | .onComplete(handleFinalResult)
23 | }
24 |
25 | }
26 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Starofall
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/starofall/s3hypersync/SyncModel.scala:
--------------------------------------------------------------------------------
1 | package io.github.starofall.s3hypersync
2 |
3 | object SyncModel {
4 |
5 | /** Represents the status of sync */
6 | trait SyncStatus
7 |
8 | /** Describes the sync status of a given file */
9 | case class FileSyncState(status: SyncStatus,
10 | file: SyncFile)
11 |
12 | /**
13 | * references a file on s3
14 | * @param bucket the bucket this file exists on
15 | * @param key the full s3 key
16 | * @param size the byteSize of the file
17 | * @param relativeKey the key relative to the root dir of the job
18 | */
19 | case class SyncFile(bucket: String,
20 | key: String,
21 | size: Long,
22 | relativeKey: String)
23 |
24 | /** Object contains statuses of sync */
25 | object SyncStatus {
26 |
27 | /** Object exists in S3 */
28 | case object Exists extends SyncStatus
29 |
30 | /** Size of the object has changed in S3 */
31 | case object SizeChanged extends SyncStatus
32 |
33 | /** Object is missing in S3 */
34 | case object Missing extends SyncStatus
35 |
36 | }
37 |
38 | }
39 |
--------------------------------------------------------------------------------
/tests/simple/bucket-a.bucket.meta:
--------------------------------------------------------------------------------
1 | {"objectMap":{"dummy.flac":{"versionedObjectMap":{"7209184677322555392":{"etag":"a0c4882a57074887748f62eaf4bcbc94","contentType":"audio/x-flac","creationDate":1720449404961,"size":716938,"fileId":7209184677322555393,"tagging":null,"userMetadata":{},"deleted":false}},"virtualVersion":"7209184677322555392"},"subdir/":{"versionedObjectMap":{"7209184710918930432":{"etag":"d41d8cd98f00b204e9800998ecf8427e","contentType":null,"creationDate":1720449412971,"size":0,"fileId":7209184710918930433,"tagging":null,"userMetadata":{},"deleted":false}},"virtualVersion":"7209184710918930432"},"subdir/dummy2.flac":{"versionedObjectMap":{"7209184749493944320":{"etag":"a0c4882a57074887748f62eaf4bcbc94","contentType":"audio/x-flac","creationDate":1720449422168,"size":716938,"fileId":7209184749493944321,"tagging":null,"userMetadata":{},"deleted":false}},"virtualVersion":"7209184749493944320"},"test.file":{"versionedObjectMap":{"7209184516538105856":{"etag":"098f6bcd4621d373cade4e832627b4f6","contentType":null,"creationDate":1720449366627,"size":4,"fileId":7209184516538105857,"tagging":null,"userMetadata":{},"deleted":false}},"virtualVersion":"7209184516538105856"}},"creationDate":1720449248883,"region":"local","uploads":{},"versioningEnabled":null,"bucketName":"bucket-a","tagging":null,"acl":null,"policy":null,"replication":null,"encryption":null}
--------------------------------------------------------------------------------
/src/test/scala/io/github/starofall/s3hypersync/SyncCommandTest.scala:
--------------------------------------------------------------------------------
1 | package io.github.starofall.s3hypersync
2 |
3 | import com.robothy.s3.rest.LocalS3
4 | import com.robothy.s3.rest.bootstrap.LocalS3Mode
5 | import org.apache.pekko.actor.ActorSystem
6 | import org.scalatest.funsuite.AnyFunSuiteLike
7 | import org.scalatest.matchers.must.Matchers.convertToAnyMustWrapper
8 |
9 | import scala.concurrent.duration.DurationInt
10 | import scala.concurrent.{Await, ExecutionContextExecutor}
11 |
12 | class SyncCommandTest extends AnyFunSuiteLike {
13 |
14 | implicit val actorSystem: ActorSystem = ActorSystem("TestSyncSystem")
15 | implicit val exc : ExecutionContextExecutor = actorSystem.dispatcher
16 |
17 | def createConfig(dryRun: Boolean): JobDefinition = {
18 | new JobDefinition(List(
19 | "--source-key", "DUMMY",
20 | "--source-secret", "DUMMY",
21 | "--source-region", "region",
22 | "--source-path-style",
23 | "--source-endpoint", "http://localhost:19090",
24 | "--target-key", "DUMMY2",
25 | "--target-secret", "DUMMY2",
26 | "--target-region", "region",
27 | "--target-endpoint", "http://localhost:19090",
28 | "--target-path-style",
29 | "--source-bucket", "bucket-a",
30 | "--target-bucket", "bucket-b") ++ (if (dryRun) List("--dry-run") else List()))
31 | }
32 |
33 |
34 | test("simple sync") {
35 | val localS3 = LocalS3.builder
36 | .mode(LocalS3Mode.IN_MEMORY)
37 | .dataPath("./tests/simple")
38 | .port(19090).build
39 | localS3.start()
40 |
41 | val dryRunConfig = createConfig(dryRun = true)
42 | SyncLogging.initLogger(dryRunConfig)
43 | val dryRunCommand = new SyncCommand(dryRunConfig)
44 | dryRunCommand.statistics.filesScanned.get() mustBe 0
45 | Await.result(dryRunCommand.runSyncJob(), 30.seconds)
46 | dryRunCommand.statistics.filesScanned.get() mustBe 3
47 |
48 | val syncConfig = createConfig(dryRun = false)
49 | val syncCommand = new SyncCommand(syncConfig)
50 | Await.result(syncCommand.runSyncJob(), 30.seconds)
51 | syncCommand.statistics.filesCopied.get() mustBe 3
52 |
53 | val checkConfig = createConfig(dryRun = true)
54 | SyncLogging.initLogger(checkConfig)
55 | val checkCommand = new SyncCommand(checkConfig)
56 | Await.result(checkCommand.runSyncJob(), 30.seconds)
57 | checkCommand.statistics.filesMissing.get() mustBe 0
58 |
59 | localS3.shutdown()
60 | }
61 |
62 | }
63 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/starofall/s3hypersync/SyncLogging.scala:
--------------------------------------------------------------------------------
1 | package io.github.starofall.s3hypersync
2 |
3 | import ch.qos.logback.classic.{Level, LoggerContext}
4 | import org.slf4j.LoggerFactory
5 |
6 | import scala.language.implicitConversions
7 |
8 |
9 | object SyncLogging {
10 |
11 | var colorActive: Boolean = true
12 |
13 | def initLogger(conf: JobDefinition): Unit = {
14 | colorActive = !conf.noColor.getOrElse(false)
15 | setRootLogLevel(conf.verbose.getOrElse(0))
16 | }
17 |
18 | /** sets the log level based on -vvv amount */
19 | def setRootLogLevel(levelInt: Int): Unit = {
20 | val level = levelInt match {
21 | case 0 => Level.INFO
22 | case 1 => Level.DEBUG
23 | case _ => Level.TRACE
24 | }
25 | val iLoggerFactory = LoggerFactory.getILoggerFactory
26 | iLoggerFactory match {
27 | case loggerContext: LoggerContext =>
28 | val rootLogger = loggerContext.getLogger(org.slf4j.Logger.ROOT_LOGGER_NAME)
29 | rootLogger.setLevel(level)
30 | case _ =>
31 | throw new IllegalStateException(s"Unexpected ILoggerFactory implementation: ${iLoggerFactory.getClass}")
32 | }
33 | }
34 |
35 |
36 | trait Logger {
37 | lazy val log = org.slf4j.LoggerFactory.getLogger(getClass)
38 |
39 | implicit def hasRainbow(s: String): RainbowString = new RainbowString(s)
40 |
41 | }
42 |
43 | class RainbowString(s: String) {
44 |
45 | import Console._
46 |
47 | /** Colorize the given string foreground to ANSI black */
48 | def black = if (colorActive) {
49 | BLACK + s + RESET
50 | } else {
51 | s
52 | }
53 |
54 | /** Colorize the given string foreground to ANSI red */
55 | def red = if (colorActive) {
56 | RED + s + RESET
57 | } else {
58 | s
59 | }
60 |
61 | /** Colorize the given string foreground to ANSI green */
62 | def green = if (colorActive) {
63 | GREEN + s + RESET
64 | } else {
65 | s
66 | }
67 |
68 | /** Colorize the given string foreground to ANSI yellow */
69 | def yellow = if (colorActive) {
70 | YELLOW + s + RESET
71 | } else {
72 | s
73 | }
74 |
75 | /** Colorize the given string foreground to ANSI blue */
76 | def blue = if (colorActive) {
77 | BLUE + s + RESET
78 | } else {
79 | s
80 | }
81 |
82 | /** Colorize the given string foreground to ANSI magenta */
83 | def magenta = if (colorActive) {
84 | MAGENTA + s + RESET
85 | } else {
86 | s
87 | }
88 |
89 | /** Colorize the given string foreground to ANSI cyan */
90 | def cyan = if (colorActive) {
91 | CYAN + s + RESET
92 | } else {
93 | s
94 | }
95 |
96 | /** Make the given string bold */
97 | def bold = if (colorActive) {
98 | BOLD + s + RESET
99 | } else {
100 | s
101 | }
102 | }
103 |
104 | }
105 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/starofall/s3hypersync/SyncUtil.scala:
--------------------------------------------------------------------------------
1 | package io.github.starofall.s3hypersync
2 |
3 | import com.typesafe.config.{Config, ConfigFactory}
4 | import io.github.starofall.s3hypersync.MainApp.log
5 | import io.github.starofall.s3hypersync.SyncLogging._
6 | import io.github.starofall.s3hypersync.SyncModel.{FileSyncState, SyncStatus}
7 | import org.apache.pekko.actor.{ActorSystem, Cancellable, Scheduler, Terminated}
8 | import org.apache.pekko.pattern.after
9 |
10 | import scala.concurrent.duration.{DurationInt, FiniteDuration}
11 | import scala.concurrent.{ExecutionContext, Future}
12 | import scala.util.{Failure, Success, Try}
13 |
14 | object SyncUtil extends Logger {
15 |
16 | def handleFinalResult(result: Try[_])
17 | (implicit actorSystem: ActorSystem): Future[Terminated] = {
18 | result match {
19 | case Failure(exception) =>
20 | log.error("Error Running Sync")
21 | exception.printStackTrace(System.err)
22 | actorSystem.registerOnTermination(() => System.exit(1))
23 | actorSystem.terminate()
24 | case Success(_) =>
25 | actorSystem.registerOnTermination(() => System.exit(0))
26 | actorSystem.terminate()
27 | }
28 |
29 | }
30 |
31 | /** if defined, we add a time bomb/timeout to our execution */
32 | def addTimeoutIfNeeded(conf: JobDefinition)
33 | (implicit actorSystem: ActorSystem, exc: ExecutionContext): Option[Cancellable] = {
34 | // build a time-bomb for timeout
35 | conf.timeout.toOption.map(timeoutSeconds => {
36 | log.info("[TIMEOUT]".yellow + s" Set a timeout of $timeoutSeconds seconds".magenta)
37 | actorSystem.scheduler.scheduleOnce(timeoutSeconds.seconds) {
38 | log.error("[CRITICAL ERROR] THE PROCESS DID NOT FINISH - HAD TO SELF-KILL".red)
39 | System.exit(2)
40 | }
41 | })
42 | }
43 |
44 |
45 | /** simple retry util in the pekko world */
46 | def retry[T](retries: Int, delay: FiniteDuration)
47 | (f: => Future[T])(implicit ec: ExecutionContext, scheduler: Scheduler): Future[T] = {
48 | f.recoverWith {
49 | case _ if retries > 0 =>
50 | log.debug("[RETRY] Had to retry...")
51 | after(delay, scheduler)(retry(retries - 1, delay)(f))
52 | }
53 | }
54 |
55 |
56 | /** creates an adjusted config for pekko for our desired worker pool size */
57 | def buildConfig(conf: JobDefinition): Config = {
58 | ConfigFactory.parseString(
59 | s"""pekko {
60 | loglevel = "ERROR"
61 | stdout-loglevel = "ERROR"
62 | actor {
63 | default-dispatcher {
64 | type = Dispatcher
65 | executor = "thread-pool-executor"
66 | thread-pool-executor {
67 | fixed-pool-size = ${conf.numWorkers.getOrElse(64) + 4}
68 | }
69 | throughput = 1000
70 | }
71 | }
72 | coordinated-shutdown.log-info = off
73 | http.host-connection-pool = {
74 | max-connections = ${conf.numWorkers.getOrElse(64) * 2 + 10}
75 | max-open-requests = ${conf.numWorkers.getOrElse(64) * 4 + 10}
76 | }
77 | }""")
78 | }
79 |
80 |
81 |
82 | }
83 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/starofall/s3hypersync/SyncS3Settings.scala:
--------------------------------------------------------------------------------
1 | package io.github.starofall.s3hypersync
2 |
3 | import com.typesafe.config.ConfigFactory
4 | import org.apache.pekko.stream.connectors.s3.S3Settings
5 |
6 | import java.nio.file.Files
7 | import scala.jdk.CollectionConverters.MapHasAsJava
8 |
9 | object SyncS3Settings {
10 |
11 | /** extracts the job definition source as s3 settings */
12 | def sourceConfig(d: JobDefinition, isHugeFile: Boolean = false): S3Settings = {
13 | buildS3Settings(d.sourceKey.toOption.get,
14 | d.sourceSecret.toOption.get,
15 | d.sourceRegion.toOption.get,
16 | d.sourceEndpoint.toOption,
17 | d.sourcePathStyle.toOption.getOrElse(false),
18 | isHugeFile)
19 | }
20 |
21 | /** extracts the job definition target as s3 settings */
22 | def targetConfig(d: JobDefinition,
23 | isHugeFile: Boolean = false): S3Settings = {
24 | buildS3Settings(d.targetKey.toOption.get,
25 | d.targetSecret.toOption.get,
26 | d.targetRegion.toOption.get,
27 | d.targetEndpoint.toOption,
28 | d.targetPathStyle.toOption.getOrElse(false),
29 | isHugeFile)
30 | }
31 |
32 | /** creates a pekko config object */
33 | private def buildS3Settings(keyId: String,
34 | accessKey: String,
35 | region: String,
36 | endpointOverwrite: Option[String],
37 | usePathAccessStyle: Boolean,
38 | isHugeFile: Boolean = false): S3Settings = {
39 | val settingMap = scala.collection.mutable.Map(
40 | "buffer" -> "memory",
41 | "validate-object-key" -> "true",
42 | "retry-settings.max-retries" -> 6,
43 | "retry-settings.min-backoff" -> "200ms",
44 | "retry-settings.max-backoff" -> "10s",
45 | "retry-settings.random-factor" -> 0.0,
46 | "multipart-upload.retry-settings.max-retries" -> 6,
47 | "multipart-upload.retry-settings.min-backoff" -> "200ms",
48 | "multipart-upload.retry-settings.max-backoff" -> "10s",
49 | "multipart-upload.retry-settings.random-factor" -> 0.0,
50 | "sign-anonymous-requests" -> true,
51 | "access-style" -> "virtual",
52 | "aws.region.provider" -> "static",
53 | "aws.region.default-region" -> region,
54 | "aws.credentials.provider" -> "static",
55 | "aws.credentials.access-key-id" -> keyId,
56 | "aws.credentials.secret-access-key" -> accessKey
57 | )
58 | // on huge files we use file buffering (else we might run OOM)
59 | if (isHugeFile) {
60 | settingMap.update("buffer", "disk")
61 | settingMap.update("disk-buffer-path", Files.createTempDirectory("s3hypersync").toAbsolutePath.toString)
62 | }
63 | // allow setting a custom endpoint
64 | if (endpointOverwrite.isDefined) {
65 | settingMap.update("endpoint-url", endpointOverwrite.get)
66 | }
67 | // for legacy storage systems like minio, we can use path style
68 | if (usePathAccessStyle) {
69 | settingMap.update("access-style", "path")
70 | }
71 | S3Settings.create(ConfigFactory.parseMap(settingMap.asJava))
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/starofall/s3hypersync/SyncCommand.scala:
--------------------------------------------------------------------------------
1 | package io.github.starofall.s3hypersync
2 |
3 | import io.github.starofall.s3hypersync.PekkoFileSyncCompareStage.createSyncSource
4 | import io.github.starofall.s3hypersync.SyncLogging.Logger
5 | import io.github.starofall.s3hypersync.SyncModel.{FileSyncState, SyncStatus}
6 | import io.github.starofall.s3hypersync.SyncUtil.retry
7 | import org.apache.pekko.actor.ActorSystem
8 | import org.apache.pekko.stream.scaladsl.{Merge, Source}
9 |
10 | import scala.concurrent.duration.DurationInt
11 | import scala.concurrent.{ExecutionContext, Future}
12 |
13 | class SyncCommand(conf: JobDefinition)
14 | (implicit actorSystem: ActorSystem, exc: ExecutionContext)
15 | extends Logger {
16 |
17 | implicit val statistics: SyncStatistics = new SyncStatistics(conf)
18 |
19 | /** runs the main sync job */
20 | def runSyncJob(): Future[Unit] = {
21 | createSource()
22 | .wireTap(x=>statistics.statCall(x))
23 | .filter(syncFilter)
24 | .mapAsyncUnordered(conf.numWorkers())(x=>handleFileSync(x))
25 | .run()
26 | .map(_ => statistics.printFinalStatistics())
27 | }
28 |
29 | private def createSource() = {
30 | if (conf.uuidBoost.toOption.getOrElse(false)) {
31 | createUUIDBoosterSource()
32 | } else {
33 | createSyncSource(conf, None)
34 | }
35 | }
36 |
37 | /** if the prefix contains only UUIDs, we can just create 16 sources and merge them */
38 | private def createUUIDBoosterSource() = {
39 | assert(conf.sourcePrefix.isDefined, "UUID booster requires source prefix")
40 | assert(conf.targetPrefix.isDefined, "UUID booster requires target prefix")
41 | // if we know that the folder contains UUIDs,
42 | // we can activate the iteration booster,
43 | // which will spawn multiple iteration calls for each
44 | // first character -> multiplexing the blocking calls for next1k
45 | // as we still compare the same prefix in the code itself,
46 | // the code should still work
47 | val extraPrefixedSources = "0123456789abcdef"
48 | .toCharArray.toList.map(c => createSyncSource(conf, Some(c.toString)))
49 | Source.combine(Source.empty, extraPrefixedSources.head, extraPrefixedSources.tail: _*)(Merge(_))
50 | }
51 |
52 |
53 | /** handles the sync of an individual file */
54 | private def handleFileSync(x: FileSyncState) = {
55 | retry(retries = 3, delay = 2.seconds) {
56 | if (conf.dryRun.getOrElse(false)) {
57 | log.info("[DRYRUN-COPY]".yellow + s" ${x.file.key.green} ${"->"} " +
58 | s"${conf.targetPrefix.getOrElse("").magenta + x.file.relativeKey.cyan}")
59 | Future.successful(())
60 | } else {
61 | log.debug("[COPY-START] " + x.file.key + " -> "
62 | + conf.targetPrefix.getOrElse("") + x.file.relativeKey)
63 | S3Connector.copyFile(
64 | conf,
65 | x.file.key,
66 | conf.targetPrefix.getOrElse("") + x.file.relativeKey,
67 | x.file.size).map { _ =>
68 | log.debug("[COPY-SUCCESS] " + x.file.key)
69 | statistics.incrementFilesCopied(x.file.size)
70 | }
71 | }
72 | }(actorSystem.dispatcher, actorSystem.scheduler)
73 | }
74 |
75 |
76 | /** A utility method to filter file synchronization based on the configuration job definitions */
77 | private def syncFilter(x: FileSyncState): Boolean = {
78 | conf.syncMode() match {
79 | case "ALWAYS" => true // take all
80 | case "CHANGED" => x.status == SyncStatus.SizeChanged || x.status == SyncStatus.Missing
81 | case "MISSING" => x.status == SyncStatus.Missing
82 | }
83 | }
84 | }
85 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # S3HyperSync
2 | S3HyperSync is a high-performance, memory-efficient, and cost-effective tool for synchronizing files between S3-compatible storage services. Optimized for speed, reliability, and minimizing AWS costs, it's ideal for large-scale data synchronization and backup tasks. Utilizing Pekko, it adopts a stream-only approach to maintain low memory requirements.
3 |
4 | ## Origin
5 | Developed to create daily backups of huge S3 buckets with millions of files and terabytes of data to a separate AWS account.
6 |
7 | ## Cost Effective
8 | To sync large S3 buckets, S3HyperSync compares the source and target listings using two streaming iterators, reducing the need for costly GetObject requests, which is especially important for DEEP_ARCHIVE storage.
9 | It also minimizes expensive multipart uploads, since every part counts as a separate PutObject request.
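The comparison is a merge of the two key-sorted listings (implemented as the streaming `PekkoFileSyncCompareStage` in this repository). A minimal, non-streaming sketch of the same idea, assuming both listings arrive in the lexicographic key order that S3 listing guarantees:

```
case class Entry(key: String, size: Long)

// walk two sorted listings in lockstep and classify every source entry
def compare(source: List[Entry], target: List[Entry]): List[(String, Entry)] =
  (source, target) match {
    case (Nil, _)                               => Nil                                   // source exhausted: done
    case (s :: ss, Nil)                         => ("MISSING", s) :: compare(ss, Nil)    // target exhausted
    case (s :: ss, t :: _)  if s.key < t.key    => ("MISSING", s) :: compare(ss, target) // key absent in target
    case (s :: ss, t :: tt) if s.key > t.key    => compare(s :: ss, tt)                  // extra target key, skip it
    case (s :: ss, t :: tt) if s.size != t.size => ("CHANGED", s) :: compare(ss, tt)     // same key, size differs
    case (s :: ss, _ :: tt)                     => ("EXISTS", s) :: compare(ss, tt)      // same key and size
  }
```

The `--sync` flag then decides which of these states are actually copied (ALWAYS, MISSING, CHANGED).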
10 |
11 | ## Performance
12 | Performance tests on AWS Fargate show iteration speeds between 8,000 and 100,000 files per second with the UUID booster feature.
13 | Copy speeds reach around 600 MB/s on a c6gn.4xlarge, or around 800 files per second for smaller files.
14 |
15 | ## UUID Booster
16 | The UUID booster feature can be used if the keys below the source prefix start with a UUID, e.g. s3://bucket/videos/$UUID.
17 | In this case the tool creates 16 iterators and processes them in parallel for extremely fast bucket comparison.
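Internally, one listing source is created per leading hex character and the sixteen sources are merged into a single stream; the relevant snippet from `SyncCommand.createUUIDBoosterSource`:

```
val extraPrefixedSources = "0123456789abcdef"
  .toCharArray.toList.map(c => createSyncSource(conf, Some(c.toString)))
Source.combine(Source.empty, extraPrefixedSources.head, extraPrefixedSources.tail: _*)(Merge(_))
```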
18 |
19 | ## Installation
20 |
21 | Download the JAR file from the GitHub Releases section or build it yourself with `sbt assembly`.
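A local build, assuming a standard sbt installation (the sbt version and the sbt-assembly plugin are pinned under `project/`):

```
sbt assembly
```

The assembled jar is written below `target/`.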
22 |
23 | ## Usage
24 |
25 | ```
26 | S3HyperSync.jar 0.1.7
27 | Usage: java -jar S3HyperSync.jar [OPTIONS]
28 | A fast, efficient, cost-reducing, and memory-efficient S3 sync tool.
29 | Options:
30 | --dry-run Show what would be copied without actually
31 | copying
32 | --multipart-size Size of each part in a multipart upload (in
33 | bytes)
34 | --no-color Disable colored output
35 | --put-cutoff-size Files larger than this size (in bytes) are
36 | uploaded using multipart
37 | --source-bucket Source S3 Bucket
38 | --source-endpoint Source S3 Endpoint
39 | --source-key Source S3 Key
40 | --source-path-style Use path style for source S3
41 | --source-prefix Source S3 Prefix (must end with /)
42 | --source-region Source S3 Region
43 | --source-secret Source S3 Secret
44 | --storage-tier Storage tier: STANDARD, INTELLIGENT_TIERING,
45 | GLACIER_IR, GLACIER_IR_AUTO, DEEP_ARCHIVE,
46 | DEEP_ARCHIVE_AUTO
47 | --sync Sync mode: ALWAYS, MISSING, CHANGED
48 | --target-bucket Target S3 Bucket
49 | --target-endpoint Target S3 Endpoint
50 | --target-key Target S3 Key
51 | --target-path-style Use path style for target S3
52 | --target-prefix Target S3 Prefix (must end with /)
53 | --target-region Target S3 Region
54 | --target-secret Target S3 Secret
55 | --timeout Kills the process after N seconds
56 | --uuid-boost Increase index speed if source prefix contains
57 | UUIDs
58 | -v, --verbose Verbose level (use multiple -v for increased
59 | verbosity)
60 | --workers Number of workers
61 | -h, --help Show help message
62 | --version Show version of this program
63 | ```
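For example, a dry run that only reports what would be copied (bucket names, region, and credential placeholders below are illustrative):

```
java -jar S3HyperSync.jar \
  --source-key "$SRC_KEY" --source-secret "$SRC_SECRET" \
  --source-region eu-west-1 --source-bucket bucket-a \
  --target-key "$TGT_KEY" --target-secret "$TGT_SECRET" \
  --target-region eu-west-1 --target-bucket bucket-b \
  --sync MISSING --dry-run -vv
```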
64 | ## Contributing
65 |
66 | We welcome contributions from the community. If you find a bug or have a feature request, please open an issue on
67 | GitHub. If you want to contribute code, please fork the repository and submit a pull request.
68 |
69 | ## License
70 |
71 | S3HyperSync is released under the MIT License. See the LICENSE file for more details.
72 |
73 | ## Acknowledgements
74 |
75 | We would like to thank all the contributors and the open-source community for their support and contributions to this
76 | project.
77 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/starofall/s3hypersync/S3Connector.scala:
--------------------------------------------------------------------------------
1 | package io.github.starofall.s3hypersync
2 |
3 | import io.github.starofall.s3hypersync.SyncLogging.Logger
4 | import io.github.starofall.s3hypersync.SyncModel.SyncFile
5 | import org.apache.pekko.NotUsed
6 | import org.apache.pekko.actor.ActorSystem
7 | import org.apache.pekko.stream.connectors.s3._
8 | import org.apache.pekko.stream.connectors.s3.scaladsl.S3
9 | import org.apache.pekko.stream.scaladsl.{Sink, Source}
10 | import org.apache.pekko.util.ByteString
11 |
12 | import scala.concurrent.Future
13 |
14 | object S3Connector extends Logger {
15 |
16 | /** copies a file from the source to the target bucket */
17 | def copyFile(job: JobDefinition,
18 | sKey: String,
19 | tKey: String,
20 | fileSize: Long)
21 | (implicit actorSystem: ActorSystem,
22 | statistics: SyncStatistics): Future[Any] = {
23 | // For large files, use disk buffering for the source stream to prevent OutOfMemoryError
24 | val isHugeFile = fileSize >= job.putCutoffSize.toOption.getOrElse(52428800)
25 | val s3Source: Source[ByteString, Future[ObjectMetadata]] = S3
26 | .getObject(job.sourceBucket.toOption.get, sKey)
27 | .withAttributes(S3Attributes.settings(SyncS3Settings.sourceConfig(job, isHugeFile)))
28 |
29 | val storageString = {
30 | job.storageTier.getOrElse("STANDARD") match {
31 | case "STANDARD" => "STANDARD"
32 | case "INTELLIGENT_TIERING" => "INTELLIGENT_TIERING"
33 | case "GLACIER_IA" => "GLACIER_IA"
34 | case "GLACIER_IA_AUTO" =>
35 | if (fileSize < 128 * 1024) {
36 | "STANDARD"
37 | } else {
38 | "GLACIER_IA"
39 | }
40 | case "DEEP_ARCHIVE" => "DEEP_ARCHIVE"
41 | case "DEEP_ARCHIVE_AUTO" =>
42 | if (fileSize < 128 * 1024) {
43 | "STANDARD"
44 | } else {
45 | "DEEP_ARCHIVE"
46 | }
47 | case _ => throw new Exception("INVALID_STORAGE_TIER")
48 | }
49 | }
50 |
51 | if (fileSize < job.putCutoffSize.toOption.getOrElse(52428800)) {
52 | log.trace(s"[COPY-PUT] ${job.sourceBucket.toOption.get} / $sKey -> ${job.targetBucket.toOption.get} / $tKey")
53 | statistics.incrementAwsPutRequests(1)
54 | S3.putObject(job.targetBucket.toOption.get, tKey,
55 | s3Headers = S3Headers().withCustomHeaders(Map("x-amz-storage-class" -> storageString)),
56 | data = s3Source,
57 | contentLength = fileSize)
58 | .withAttributes(S3Attributes.settings(SyncS3Settings.targetConfig(job)))
59 | .run()
60 | } else {
61 | log.trace(s"[COPY-MULTIPART] ${job.sourceBucket.toOption.get} / $sKey -> ${job.targetBucket.toOption.get} / $tKey")
62 | val multiPartChunkSize = job.multipartSize.getOrElse(52428800)
63 | statistics.incrementAwsPutRequests(2 + Math.max(1, (fileSize / multiPartChunkSize).toInt))
64 | val s3Sink: Sink[ByteString, Future[MultipartUploadResult]] = S3
65 | .multipartUploadWithHeaders(
66 | job.targetBucket.toOption.get, tKey,
67 | chunkSize = multiPartChunkSize,
68 | s3Headers = S3Headers().withCustomHeaders(Map("x-amz-storage-class" -> storageString)))
69 | .withAttributes(S3Attributes.settings(SyncS3Settings.targetConfig(job, isHugeFile = true)))
70 | s3Source.runWith(s3Sink)
71 | }
72 | }
73 |
74 | def listBucket(bucket: String,
75 | prefix: Option[String],
76 | subPrefix: Option[String], // an additional prefix that does not influence relative DIR
77 | s3Settings: S3Settings): Source[SyncFile, NotUsed] = {
78 | val searchPrefix = (prefix, subPrefix) match {
79 | case (Some(x), Some(y)) => Some(x + y)
80 | case (None, Some(y)) => Some(y)
81 | case (Some(x), None) => Some(x)
82 | case _ => None
83 | }
84 | S3.listBucket(bucket, searchPrefix).withAttributes(S3Attributes.settings(s3Settings))
85 | .filterNot(x => x.size == 0 && x.key.endsWith("/")) // drop folders
86 | .map(x => SyncFile(
87 | x.bucketName, x.key, x.size,
88 | prefix match {
89 | case Some(value) => x.key.stripPrefix(value)
90 | case None => x.key
91 | }))
92 | }
93 |
94 | }
95 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/starofall/s3hypersync/SyncStatistics.scala:
--------------------------------------------------------------------------------
1 | package io.github.starofall.s3hypersync
2 |
3 | import io.github.starofall.s3hypersync.SyncLogging.Logger
4 | import io.github.starofall.s3hypersync.SyncModel.{FileSyncState, SyncStatus}
5 | import org.apache.pekko.actor.{ActorSystem, Cancellable}
6 |
7 | import java.util.concurrent.atomic.{AtomicInteger, AtomicLong}
8 | import scala.concurrent.ExecutionContext
9 | import scala.concurrent.duration.DurationInt
10 |
11 | class SyncStatistics(conf: JobDefinition)
12 | (implicit actorSystem: ActorSystem, executionContext: ExecutionContext)
13 | extends Logger {
14 |
15 | val filesScanned = new AtomicInteger(0)
16 | val filesMissing = new AtomicInteger(0)
17 | val filesChanged = new AtomicInteger(0)
18 | val filesCopied = new AtomicInteger(0)
19 | val filesExisting = new AtomicInteger(0)
20 | val filesScannedLastSecond = new AtomicInteger(0)
21 | val awsPutRequests = new AtomicInteger(0)
22 | val bytesTransferredLastSecond: AtomicLong = new AtomicLong(0L)
23 | val totalBytesTransferred : AtomicLong = new AtomicLong(0L)
24 | var lastUpdateTime : Long = System.currentTimeMillis()
25 | var started = false
26 |
27 | initStatistics()
28 |
29 | def statCall(x: FileSyncState): Unit = {
30 | setStarted()
31 | x.status match {
32 | case SyncStatus.Missing => incrementFilesMissing()
33 | case SyncStatus.SizeChanged => incrementFilesChanged()
34 | case SyncStatus.Exists => incrementFilesExisting()
35 | }
36 | incrementFilesScanned()
37 | log.trace(x.status.toString + "->" + x.file.key)
38 | }
39 |
40 | def incrementFilesScanned(): Unit = {
41 | filesScanned.incrementAndGet()
42 | filesScannedLastSecond.incrementAndGet()
43 | }
44 |
45 | def incrementFilesMissing(): Unit = filesMissing.incrementAndGet()
46 |
47 | def incrementFilesChanged(): Unit = filesChanged.incrementAndGet()
48 |
49 | def incrementFilesExisting(): Unit = filesExisting.incrementAndGet()
50 |
51 | def setStarted(): Unit = started = true
52 |
53 | def incrementFilesCopied(size: Long): Unit = {
54 | filesCopied.incrementAndGet()
55 | bytesTransferredLastSecond.addAndGet(size)
56 | totalBytesTransferred.addAndGet(size)
57 | }
58 |
59 | def incrementAwsPutRequests(l: Int): Int = awsPutRequests.addAndGet(l)
60 |
61 | def printFinalStatistics(): Unit = {
62 | log.info("##############")
63 | log.info("## Sync Stats")
64 | log.info(s"# Missing | ${filesMissing.get().toString.yellow}")
65 | log.info(s"# Changed | ${filesChanged.get().toString.yellow}")
66 | log.info(s"# Exists | ${filesExisting.get().toString.yellow}")
67 | log.info("##############")
68 | log.info("## Copy Stats")
69 | log.info(s"# Files | ${filesCopied.get().toString.yellow}")
70 | log.info(s"# MB | ${(totalBytesTransferred.get() / 1024.0 / 1024.0).round.toString.yellow}")
71 | log.info("##############")
72 | log.info("## Cost Stats")
73 | log.info(s"# Puts | ${awsPutRequests.get().toString.yellow}")
74 | log.info("##############")
75 | }
76 |
77 | def initStatistics(): Cancellable = {
78 | log.info(s"[INIT] ".yellow +
79 | s"${conf.sourceBucket.toOption.get}/${conf.sourcePrefix.getOrElse("")} ".green +
80 | s"-> " +
81 | s"${conf.targetBucket.toOption.get}/${conf.targetPrefix.getOrElse("")}".cyan)
82 | // Schedule a task to print statistics every 5 seconds (after an initial 1-second delay)
83 | actorSystem.scheduler.scheduleAtFixedRate(1.second, 5.second) {
84 | () => printStatistics()
85 | }
86 | }
87 |
88 | def printStatistics(): Unit = {
89 | val currentTime = System.currentTimeMillis()
90 | val duration = (currentTime - lastUpdateTime) / 1000.0
91 | val speed = filesScannedLastSecond.get() / duration
92 | val MBspeed = bytesTransferredLastSecond.get() / (1024.0 * 1024.0 * duration) // in MB/s
93 | lastUpdateTime = currentTime
94 | filesScannedLastSecond.set(0)
95 | bytesTransferredLastSecond.set(0) // Reset for the next interval
96 | if (started) {
97 | log.info(f"[STATS] ".yellow +
98 | f"Bandwidth: $MBspeed%.2f MB/s | ".magenta +
99 | f"Files: $speed%.2f f/sec".red +
100 | f" | Scanned: $filesScanned".cyan +
101 | f" | Copied: $filesCopied".cyan +
102 | f" | Missing: $filesMissing".cyan +
103 | f" | Changed: $filesChanged".cyan +
104 | f" | Existing: $filesExisting".cyan)
105 | }
106 | }
107 | }
108 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/starofall/s3hypersync/JobDefinition.scala:
--------------------------------------------------------------------------------
1 | package io.github.starofall.s3hypersync
2 |
3 | import org.rogach.scallop.{ScallopConf, ScallopOption}
4 |
5 | /** CLI parser and config */
6 | class JobDefinition(arguments: Seq[String]) extends ScallopConf(arguments) {
7 | version("S3HyperSync.jar 0.1.7")
8 | noshort = true
9 | banner(
10 | """Usage: java -jar S3HyperSync.jar [OPTIONS]
11 | |A fast, efficient, cost-reducing, and memory-efficient S3 sync tool.
12 | |Options:
13 | |""".stripMargin)
14 | footer("\n")
15 |
16 | /** Number of workers */
17 | val numWorkers: ScallopOption[Int] = opt[Int](name = "workers", descr = "Number of workers", default = Some(64))
18 |
19 | /** Files larger than this size (in bytes) are uploaded using multipart */
20 | val putCutoffSize: ScallopOption[Int] = opt[Int](name = "put-cutoff-size", descr = "Files larger than this size (in bytes) are uploaded using multipart", default = Some(52428800))
21 |
22 | /** Size of each part in a multipart upload (in bytes) */
23 | val multipartSize: ScallopOption[Int] = opt[Int](name = "multipart-size", descr = "Size of each part in a multipart upload (in bytes)", default = Some(52428800))
24 |
25 | /** Sync mode: ALWAYS, MISSING, CHANGED */
26 | val syncMode: ScallopOption[String] = opt[String](name = "sync", descr = "Sync mode: ALWAYS, MISSING, CHANGED", validate = Set("ALWAYS", "MISSING", "CHANGED"), default = Some("MISSING"))
27 |
28 | /** Source S3 Key */
29 | val sourceKey: ScallopOption[String] = opt[String](name = "source-key", required = true, descr = "Source S3 Key")
30 |
31 | /** Source S3 Secret */
32 | val sourceSecret: ScallopOption[String] = opt[String](name = "source-secret", required = true, descr = "Source S3 Secret")
33 |
34 | /** Source S3 Region */
35 | val sourceRegion: ScallopOption[String] = opt[String](name = "source-region", required = true, descr = "Source S3 Region")
36 |
37 | /** Source S3 Bucket */
38 | val sourceBucket: ScallopOption[String] = opt[String](name = "source-bucket", required = true, descr = "Source S3 Bucket")
39 |
40 | /** Source S3 Prefix (must end with /) */
41 | val sourcePrefix: ScallopOption[String] = opt[String](name = "source-prefix", descr = "Source S3 Prefix (must end with /)", validate = _.endsWith("/"))
42 |
43 | /** Source S3 Endpoint */
44 | val sourceEndpoint: ScallopOption[String] = opt[String](name = "source-endpoint", descr = "Source S3 Endpoint")
45 |
46 | /** Use path style for source S3 */
47 | val sourcePathStyle: ScallopOption[Boolean] = opt[Boolean](name = "source-path-style", descr = "Use path style for source S3")
48 |
49 | /** Target S3 Key */
50 | val targetKey: ScallopOption[String] = opt[String](name = "target-key", required = true, descr = "Target S3 Key")
51 |
52 | /** Target S3 Secret */
53 | val targetSecret: ScallopOption[String] = opt[String](name = "target-secret", required = true, descr = "Target S3 Secret")
54 |
55 | /** Target S3 Region */
56 | val targetRegion: ScallopOption[String] = opt[String](name = "target-region", required = true, descr = "Target S3 Region")
57 |
58 | /** Target S3 Bucket */
59 | val targetBucket: ScallopOption[String] = opt[String](name = "target-bucket", required = true, descr = "Target S3 Bucket")
60 |
61 | /** Target S3 Prefix (must end with /) */
62 | val targetPrefix: ScallopOption[String] = opt[String](name = "target-prefix", descr = "Target S3 Prefix (must end with /)", validate = _.endsWith("/"))
63 |
64 | /** Target S3 Endpoint */
65 | val targetEndpoint: ScallopOption[String] = opt[String](name = "target-endpoint", descr = "Target S3 Endpoint")
66 |
67 | /** Use path style for target S3 */
68 | val targetPathStyle: ScallopOption[Boolean] = opt[Boolean](name = "target-path-style", descr = "Use path style for target S3")
69 |
70 | /** Storage tier: STANDARD, INTELLIGENT_TIERING, GLACIER_IR, GLACIER_IR_AUTO, DEEP_ARCHIVE, DEEP_ARCHIVE_AUTO */
71 | val storageTier: ScallopOption[String] = opt[String](name = "storage-tier", descr = "Storage tier: STANDARD, INTELLIGENT_TIERING, GLACIER_IR, GLACIER_IR_AUTO, DEEP_ARCHIVE, DEEP_ARCHIVE_AUTO", validate = Set("STANDARD", "INTELLIGENT_TIERING", "GLACIER_IR", "GLACIER_IR_AUTO", "DEEP_ARCHIVE", "DEEP_ARCHIVE_AUTO"), default = Some("STANDARD"))
72 |
73 | /** Verbose level (use multiple -v for increased verbosity) */
74 | val verbose: ScallopOption[Int] = tally(descr = "Verbose level (use multiple -v for increased verbosity)", short = 'v', noshort = false)
75 |
76 | /** Increase index speed if source prefix contains UUIDs */
77 | val uuidBoost: ScallopOption[Boolean] = opt[Boolean](name = "uuid-boost", descr = "Increase index speed if source prefix contains UUIDs", default = Some(false))
78 |
79 | /** Show what would be copied without actually copying */
80 | val dryRun: ScallopOption[Boolean] = opt[Boolean](name = "dry-run", descr = "Show what would be copied without actually copying", default = Some(false))
81 |
82 | /** Disable colored output */
83 | val noColor: ScallopOption[Boolean] = opt[Boolean](name = "no-color", descr = "Disable colored output", default = Some(false))
84 |
85 | /** Kill the process after N seconds */
86 | val timeout: ScallopOption[Int] = opt[Int](name = "timeout", descr = "Kills the process after N seconds")
87 |
88 | verify()
89 | }
90 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/starofall/s3hypersync/PekkoFileSyncCompareStage.scala:
--------------------------------------------------------------------------------
1 | package io.github.starofall.s3hypersync
2 |
3 | import io.github.starofall.s3hypersync.SyncLogging.Logger
4 | import io.github.starofall.s3hypersync.SyncModel.{FileSyncState, SyncFile, SyncStatus}
5 | import org.apache.pekko.NotUsed
6 | import org.apache.pekko.stream._
7 | import org.apache.pekko.stream.scaladsl.{GraphDSL, Source}
8 | import org.apache.pekko.stream.stage.{GraphStage, GraphStageLogic, InHandler, OutHandler}
9 |
10 | class PekkoFileSyncCompareStage extends GraphStage[FanInShape2[SyncFile, SyncFile, FileSyncState]] with Logger {
11 |
12 | //@formatter:off
13 | val inA : Inlet[SyncFile] = Inlet("CompareAndFilterStage.inA")
14 | val inB : Inlet[SyncFile] = Inlet("CompareAndFilterStage.inB")
15 | val out : Outlet[FileSyncState] = Outlet("CompareAndFilterStage.out")
16 | val shape: FanInShape2[SyncFile, SyncFile, FileSyncState] = new FanInShape2(inA, inB, out)
17 | //@formatter:on
18 |
19 | override def createLogic(inheritedAttributes: Attributes): GraphStageLogic = new GraphStageLogic(shape) {
20 |
21 | var aBuffer: Option[SyncFile] = None
22 | var bBuffer: Option[SyncFile] = None
23 | var aFinished = false
24 | var bFinished = false
25 |
26 | setHandler(inA, new InHandler {
27 | override def onPush(): Unit = {
28 | aBuffer = Some(grab(inA))
29 | log.trace(s"A Pushed - $aBuffer")
30 | compareAndPush()
31 | }
32 |
33 | override def onUpstreamFinish(): Unit = {
34 | log.trace("A upstream finished")
35 | aFinished = true
36 | compareAndPush()
37 | }
38 | })
39 |
40 | setHandler(inB, new InHandler {
41 | override def onPush(): Unit = {
42 | bBuffer = Some(grab(inB))
43 | log.trace(s"B Pushed - $bBuffer")
44 | compareAndPush()
45 | }
46 |
47 | override def onUpstreamFinish(): Unit = {
48 | log.trace("B upstream finished")
49 | bFinished = true
50 | compareAndPush()
51 | }
52 | })
53 |
54 | setHandler(out, new OutHandler {
55 | override def onPull(): Unit = {
56 | log.trace("OUT pull")
57 | if (aBuffer.isEmpty && !hasBeenPulled(inA) && !aFinished) {
58 | pull(inA)
59 | }
60 | if (bBuffer.isEmpty && !hasBeenPulled(inB) && !bFinished) {
61 | pull(inB)
62 | }
63 | compareAndPush()
64 | }
65 | })
66 |
67 | def compareAndPush(): Unit = {
68 | if (isAvailable(out)) {
69 | (aBuffer, bBuffer) match {
70 | // if a's key sorts before b's key, a is definitely missing in the target
71 | case (Some(a), Some(b)) if a.relativeKey < b.relativeKey =>
72 | log.trace("-> missing")
73 | push(out, FileSyncState(SyncStatus.Missing, a))
74 | clearAandPull()
75 |
76 | // if a == b and size is different, call changed
77 | case (Some(a), Some(b)) if a.relativeKey == b.relativeKey && a.size != b.size =>
78 | log.trace("-> sizechanged")
79 | // same key but different size -> needs to be copied again
80 | push(out, FileSyncState(SyncStatus.SizeChanged, a))
81 | clearAandPull()
82 |
83 | case (Some(a), Some(b)) if a.relativeKey == b.relativeKey => // aka same size
84 | log.trace("-> exists")
85 | // same key, same size
86 | push(out, FileSyncState(SyncStatus.Exists, a))
87 | clearAandPull()
88 |
89 | // this means a.relativeKey > b.relativeKey aka we need more B to continue
90 | case (Some(a), Some(b)) if !bFinished =>
91 | log.trace("-> we need more b")
92 | clearBandPull()
93 |
94 | // there is a last element in B, but b is done so we just drop it
95 | case (Some(a), Some(b)) => // aka if bFinished
96 | log.trace("-> ignore the last b")
97 | bBuffer = None
98 | push(out, FileSyncState(SyncStatus.Missing, a))
99 | clearAandPull()
100 |
101 | // if b is empty AND finished, all other A's are missing
102 | case (Some(a), None) if bFinished =>
103 | log.trace("-> b is empty")
104 | push(out, FileSyncState(SyncStatus.Missing, a))
105 | clearAandPull()
106 |
107 | // if b is empty but not finished, call for more Bs
108 | case (Some(a), None) => // aka if !bFinished
109 | log.trace("-> b empty (but not finished) need more bs")
110 | clearBandPull()
111 |
112 | // we still have As to call
113 | case (None, _) if !aFinished =>
114 | log.trace("-> a empty and more needed")
115 | clearAandPull()
116 |
117 | // a finished, so we are done
118 | case (None, _) =>
119 | log.trace("-> done a empty")
120 | completeStage()
121 | }
122 | }
123 | }
124 |
125 | private def clearBandPull(): Unit = {
126 | bBuffer = None
127 | if (!bFinished && !hasBeenPulled(inB)) pull(inB)
128 | }
129 |
130 | private def clearAandPull(): Unit = {
131 | aBuffer = None
132 | if (!aFinished && !hasBeenPulled(inA)) pull(inA)
133 | }
134 |
135 | override def preStart(): Unit = {
136 | pull(inA)
137 | pull(inB)
138 | }
139 | }
140 | }
141 |
142 | object PekkoFileSyncCompareStage {
143 |
144 | /** compares the files from the given sources against each other */
145 | def compareFilesToTarget(syncSource: Source[SyncFile, NotUsed],
146 | syncTarget: Source[SyncFile, NotUsed]): Source[FileSyncState, NotUsed] = {
147 | Source.fromGraph(GraphDSL.create() { implicit builder =>
148 | import GraphDSL.Implicits._
149 | val compareAndFilter = builder.add(new PekkoFileSyncCompareStage)
150 | syncSource ~> compareAndFilter.in0
151 | syncTarget ~> compareAndFilter.in1
152 | SourceShape(compareAndFilter.out)
153 | })
154 | }
155 |
156 | /** creates our source of files to sync from the job definition*/
157 | def createSyncSource(conf: JobDefinition, additionalPrefix: Option[String]): Source[FileSyncState, NotUsed] = {
158 | PekkoFileSyncCompareStage
159 | .compareFilesToTarget(
160 | S3Connector.listBucket(conf.sourceBucket.toOption.get,
161 | conf.sourcePrefix.toOption,
162 | additionalPrefix,
163 | SyncS3Settings.sourceConfig(conf))
164 | .buffer(10000, OverflowStrategy.backpressure).async,
165 | S3Connector.listBucket(conf.targetBucket.toOption.get,
166 | conf.targetPrefix.toOption,
167 | additionalPrefix,
168 | SyncS3Settings.targetConfig(conf))
169 | .buffer(10000, OverflowStrategy.backpressure).async)
170 | }
171 |
172 | }
173 |
--------------------------------------------------------------------------------